From 94a575d1b9714c90c275856c07274d212b2189cc Mon Sep 17 00:00:00 2001 From: RickNie Date: Wed, 20 May 2026 16:53:47 +0800 Subject: [PATCH 1/7] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81Prompt=E8=87=AA?= =?UTF-8?q?=E4=BC=98=E5=8C=96AgentOptimizer=20(#68)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../assets/imgs/optimization_quickstart.png | Bin 0 -> 307225 bytes docs/mkdocs/en/optimization.md | 2030 ++++++++++++++++ docs/mkdocs/zh/optimization.md | 2038 +++++++++++++++++ .../advanced_strategies/README.md | 206 ++ .../advanced_strategies/agent/__init__.py | 0 .../advanced_strategies/agent/agent.py | 134 ++ .../advanced_strategies/agent/config.py | 33 + .../agent/prompts/system.md | 1 + .../advanced_strategies/compare.py | 100 + .../data/train.evalset.json | 73 + .../advanced_strategies/data/val.evalset.json | 73 + .../optimizer_advanced.json | 48 + .../optimizer_baseline.json | 45 + .../advanced_strategies/run_advanced.py | 70 + .../advanced_strategies/run_baseline.py | 65 + examples/optimization/blackbox_cli/README.md | 205 ++ .../blackbox_cli/agent/__init__.py | 10 + .../blackbox_cli/agent/call_agent.py | 141 ++ .../optimization/blackbox_cli/optimizer.json | 45 + .../blackbox_cli/run_optimization.py | 89 + .../blackbox_cli/train.evalset.json | 62 + .../blackbox_cli/val.evalset.json | 40 + .../.claude/skills/city-info/SKILL.md | 6 + .../blackbox_cli/workspace/CLAUDE.md | 1 + .../optimization/ci_integration/README.md | 243 ++ .../ci_integration/agent/__init__.py | 0 .../ci_integration/agent/agent.py | 156 ++ .../ci_integration/agent/config.py | 33 + .../ci_integration/agent/prompts/skill.md | 1 + .../ci_integration/agent/prompts/system.md | 10 + .../ci_integration/ci/run_nightly_optimize.sh | 20 + .../ci_integration/ci/run_pr_check.sh | 12 + .../ci_integration/data/test_config.json | 14 + .../ci_integration/data/train.evalset.json | 40 + .../ci_integration/data/val.evalset.json | 40 + .../ci_integration/optimizer.json 
| 45 + .../ci_integration/run_optimization.py | 97 + .../ci_integration/tests/__init__.py | 0 .../tests/test_agent_quality.py | 62 + examples/optimization/http_service/README.md | 197 ++ .../optimization/http_service/optimizer.json | 45 + .../http_service/run_optimization.py | 123 + .../http_service/service/__init__.py | 5 + .../http_service/service/prompts/system.md | 1 + .../http_service/service/server.py | 157 ++ .../http_service/train.evalset.json | 112 + .../http_service/val.evalset.json | 70 + .../multi_agent_pipeline/README.md | 191 ++ .../multi_agent_pipeline/optimizer.json | 46 + .../multi_agent_pipeline/pipeline/__init__.py | 5 + .../multi_agent_pipeline/pipeline/config.py | 33 + .../pipeline/orchestrator.py | 131 ++ .../pipeline/prompts/fact_agent.md | 1 + .../pipeline/prompts/math_agent.md | 1 + .../pipeline/prompts/router.md | 1 + .../pipeline/prompts/summarizer.md | 1 + .../multi_agent_pipeline/run_optimization.py | 105 + .../multi_agent_pipeline/train.evalset.json | 92 + .../multi_agent_pipeline/val.evalset.json | 58 + .../multi_metric_with_judges/README.md | 241 ++ .../agent/__init__.py | 5 + .../multi_metric_with_judges/agent/agent.py | 48 + .../multi_metric_with_judges/agent/config.py | 33 + .../agent/prompts/system.md | 1 + .../multi_metric_with_judges/optimizer.json | 100 + .../run_optimization.py | 123 + .../train.evalset.json | 112 + .../multi_metric_with_judges/val.evalset.json | 70 + examples/optimization/quickstart/README.md | 213 ++ .../optimization/quickstart/agent/__init__.py | 5 + .../optimization/quickstart/agent/agent.py | 103 + .../optimization/quickstart/agent/config.py | 33 + .../quickstart/agent/prompts/skill.md | 1 + .../quickstart/agent/prompts/system.md | 1 + .../optimization/quickstart/optimizer.json | 88 + .../quickstart/run_optimization.py | 167 ++ .../quickstart/train.evalset.json | 112 + .../optimization/quickstart/val.evalset.json | 70 + .../remote_prompt_store/README.md | 208 ++ .../remote_prompt_store/agent/__init__.py 
| 5 + .../remote_prompt_store/agent/agent.py | 48 + .../remote_prompt_store/agent/config.py | 33 + .../remote_prompt_store/optimizer.json | 45 + .../remote_prompt_store/run_optimization.py | 161 ++ .../remote_prompt_store/store/__init__.py | 5 + .../store/fake_kv_store.py | 53 + .../store/prompt_client.py | 85 + .../remote_prompt_store/store/store.json | 4 + .../remote_prompt_store/train.evalset.json | 112 + .../remote_prompt_store/val.evalset.json | 70 + .../slo_runtime_control/README.md | 218 ++ .../slo_runtime_control/agent/__init__.py | 10 + .../slo_runtime_control/agent/agent.py | 47 + .../slo_runtime_control/agent/config.py | 33 + .../agent/prompts/system.md | 1 + .../slo_runtime_control/optimizer.json | 48 + .../slo_runtime_control/run_optimization.py | 143 ++ .../slo_runtime_control/train.evalset.json | 239 ++ .../slo_runtime_control/val.evalset.json | 123 + pyproject.toml | 7 + requirements-test.txt | 2 + requirements.txt | 2 + tests/evaluation/test_agent_optimizer.py | 1285 +++++++++++ tests/evaluation/test_base_optimizer.py | 240 ++ tests/evaluation/test_optimize_config.py | 629 +++++ .../test_optimize_evaluator_call.py | 613 +++++ .../evaluation/test_optimize_gepa_adapter.py | 1748 ++++++++++++++ .../evaluation/test_optimize_gepa_callback.py | 667 ++++++ tests/evaluation/test_optimize_gepa_e2e.py | 210 ++ .../test_optimize_gepa_reflective.py | 1628 +++++++++++++ tests/evaluation/test_optimize_metric_info.py | 630 +++++ .../test_optimize_model_callable.py | 261 +++ .../evaluation/test_optimize_model_options.py | 113 + .../test_optimize_quickstart_example.py | 489 ++++ tests/evaluation/test_optimize_registry.py | 109 + tests/evaluation/test_optimize_reporter.py | 611 +++++ tests/evaluation/test_optimize_result.py | 456 ++++ tests/evaluation/test_remote_eval_service.py | 30 + tests/evaluation/test_target_prompt.py | 539 +++++ trpc_agent_sdk/evaluation/__init__.py | 62 + trpc_agent_sdk/evaluation/_agent_evaluator.py | 33 +- 
trpc_agent_sdk/evaluation/_agent_optimizer.py | 614 +++++ trpc_agent_sdk/evaluation/_base_optimizer.py | 123 + trpc_agent_sdk/evaluation/_optimize_config.py | 257 +++ .../evaluation/_optimize_evaluator_call.py | 136 ++ .../evaluation/_optimize_gepa_adapter.py | 794 +++++++ .../evaluation/_optimize_gepa_callback.py | 381 +++ .../evaluation/_optimize_gepa_reflective.py | 612 +++++ .../evaluation/_optimize_metric_info.py | 534 +++++ .../evaluation/_optimize_model_callable.py | 309 +++ .../evaluation/_optimize_model_options.py | 45 + .../evaluation/_optimize_registrations.py | 22 + .../evaluation/_optimize_registry.py | 41 + .../evaluation/_optimize_reporter.py | 1001 ++++++++ trpc_agent_sdk/evaluation/_optimize_result.py | 361 +++ .../evaluation/_remote_eval_service.py | 9 + trpc_agent_sdk/evaluation/_target_prompt.py | 243 ++ 137 files changed, 26724 insertions(+), 2 deletions(-) create mode 100644 docs/mkdocs/assets/imgs/optimization_quickstart.png create mode 100644 docs/mkdocs/en/optimization.md create mode 100644 docs/mkdocs/zh/optimization.md create mode 100644 examples/optimization/advanced_strategies/README.md create mode 100644 examples/optimization/advanced_strategies/agent/__init__.py create mode 100644 examples/optimization/advanced_strategies/agent/agent.py create mode 100644 examples/optimization/advanced_strategies/agent/config.py create mode 100644 examples/optimization/advanced_strategies/agent/prompts/system.md create mode 100644 examples/optimization/advanced_strategies/compare.py create mode 100644 examples/optimization/advanced_strategies/data/train.evalset.json create mode 100644 examples/optimization/advanced_strategies/data/val.evalset.json create mode 100644 examples/optimization/advanced_strategies/optimizer_advanced.json create mode 100644 examples/optimization/advanced_strategies/optimizer_baseline.json create mode 100644 examples/optimization/advanced_strategies/run_advanced.py create mode 100644 
examples/optimization/advanced_strategies/run_baseline.py create mode 100644 examples/optimization/blackbox_cli/README.md create mode 100644 examples/optimization/blackbox_cli/agent/__init__.py create mode 100644 examples/optimization/blackbox_cli/agent/call_agent.py create mode 100644 examples/optimization/blackbox_cli/optimizer.json create mode 100644 examples/optimization/blackbox_cli/run_optimization.py create mode 100644 examples/optimization/blackbox_cli/train.evalset.json create mode 100644 examples/optimization/blackbox_cli/val.evalset.json create mode 100644 examples/optimization/blackbox_cli/workspace/.claude/skills/city-info/SKILL.md create mode 100644 examples/optimization/blackbox_cli/workspace/CLAUDE.md create mode 100644 examples/optimization/ci_integration/README.md create mode 100644 examples/optimization/ci_integration/agent/__init__.py create mode 100644 examples/optimization/ci_integration/agent/agent.py create mode 100644 examples/optimization/ci_integration/agent/config.py create mode 100644 examples/optimization/ci_integration/agent/prompts/skill.md create mode 100644 examples/optimization/ci_integration/agent/prompts/system.md create mode 100755 examples/optimization/ci_integration/ci/run_nightly_optimize.sh create mode 100755 examples/optimization/ci_integration/ci/run_pr_check.sh create mode 100644 examples/optimization/ci_integration/data/test_config.json create mode 100644 examples/optimization/ci_integration/data/train.evalset.json create mode 100644 examples/optimization/ci_integration/data/val.evalset.json create mode 100644 examples/optimization/ci_integration/optimizer.json create mode 100644 examples/optimization/ci_integration/run_optimization.py create mode 100644 examples/optimization/ci_integration/tests/__init__.py create mode 100644 examples/optimization/ci_integration/tests/test_agent_quality.py create mode 100644 examples/optimization/http_service/README.md create mode 100644 
examples/optimization/http_service/optimizer.json create mode 100644 examples/optimization/http_service/run_optimization.py create mode 100644 examples/optimization/http_service/service/__init__.py create mode 100644 examples/optimization/http_service/service/prompts/system.md create mode 100644 examples/optimization/http_service/service/server.py create mode 100644 examples/optimization/http_service/train.evalset.json create mode 100644 examples/optimization/http_service/val.evalset.json create mode 100644 examples/optimization/multi_agent_pipeline/README.md create mode 100644 examples/optimization/multi_agent_pipeline/optimizer.json create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/__init__.py create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/config.py create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/orchestrator.py create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/prompts/fact_agent.md create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/prompts/math_agent.md create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/prompts/router.md create mode 100644 examples/optimization/multi_agent_pipeline/pipeline/prompts/summarizer.md create mode 100644 examples/optimization/multi_agent_pipeline/run_optimization.py create mode 100644 examples/optimization/multi_agent_pipeline/train.evalset.json create mode 100644 examples/optimization/multi_agent_pipeline/val.evalset.json create mode 100644 examples/optimization/multi_metric_with_judges/README.md create mode 100644 examples/optimization/multi_metric_with_judges/agent/__init__.py create mode 100644 examples/optimization/multi_metric_with_judges/agent/agent.py create mode 100644 examples/optimization/multi_metric_with_judges/agent/config.py create mode 100644 examples/optimization/multi_metric_with_judges/agent/prompts/system.md create mode 100644 examples/optimization/multi_metric_with_judges/optimizer.json 
create mode 100644 examples/optimization/multi_metric_with_judges/run_optimization.py create mode 100644 examples/optimization/multi_metric_with_judges/train.evalset.json create mode 100644 examples/optimization/multi_metric_with_judges/val.evalset.json create mode 100644 examples/optimization/quickstart/README.md create mode 100644 examples/optimization/quickstart/agent/__init__.py create mode 100644 examples/optimization/quickstart/agent/agent.py create mode 100644 examples/optimization/quickstart/agent/config.py create mode 100644 examples/optimization/quickstart/agent/prompts/skill.md create mode 100644 examples/optimization/quickstart/agent/prompts/system.md create mode 100644 examples/optimization/quickstart/optimizer.json create mode 100644 examples/optimization/quickstart/run_optimization.py create mode 100644 examples/optimization/quickstart/train.evalset.json create mode 100644 examples/optimization/quickstart/val.evalset.json create mode 100644 examples/optimization/remote_prompt_store/README.md create mode 100644 examples/optimization/remote_prompt_store/agent/__init__.py create mode 100644 examples/optimization/remote_prompt_store/agent/agent.py create mode 100644 examples/optimization/remote_prompt_store/agent/config.py create mode 100644 examples/optimization/remote_prompt_store/optimizer.json create mode 100644 examples/optimization/remote_prompt_store/run_optimization.py create mode 100644 examples/optimization/remote_prompt_store/store/__init__.py create mode 100644 examples/optimization/remote_prompt_store/store/fake_kv_store.py create mode 100644 examples/optimization/remote_prompt_store/store/prompt_client.py create mode 100644 examples/optimization/remote_prompt_store/store/store.json create mode 100644 examples/optimization/remote_prompt_store/train.evalset.json create mode 100644 examples/optimization/remote_prompt_store/val.evalset.json create mode 100644 examples/optimization/slo_runtime_control/README.md create mode 100644 
examples/optimization/slo_runtime_control/agent/__init__.py create mode 100644 examples/optimization/slo_runtime_control/agent/agent.py create mode 100644 examples/optimization/slo_runtime_control/agent/config.py create mode 100644 examples/optimization/slo_runtime_control/agent/prompts/system.md create mode 100644 examples/optimization/slo_runtime_control/optimizer.json create mode 100644 examples/optimization/slo_runtime_control/run_optimization.py create mode 100644 examples/optimization/slo_runtime_control/train.evalset.json create mode 100644 examples/optimization/slo_runtime_control/val.evalset.json create mode 100644 tests/evaluation/test_agent_optimizer.py create mode 100644 tests/evaluation/test_base_optimizer.py create mode 100644 tests/evaluation/test_optimize_config.py create mode 100644 tests/evaluation/test_optimize_evaluator_call.py create mode 100644 tests/evaluation/test_optimize_gepa_adapter.py create mode 100644 tests/evaluation/test_optimize_gepa_callback.py create mode 100644 tests/evaluation/test_optimize_gepa_e2e.py create mode 100644 tests/evaluation/test_optimize_gepa_reflective.py create mode 100644 tests/evaluation/test_optimize_metric_info.py create mode 100644 tests/evaluation/test_optimize_model_callable.py create mode 100644 tests/evaluation/test_optimize_model_options.py create mode 100644 tests/evaluation/test_optimize_quickstart_example.py create mode 100644 tests/evaluation/test_optimize_registry.py create mode 100644 tests/evaluation/test_optimize_reporter.py create mode 100644 tests/evaluation/test_optimize_result.py create mode 100644 tests/evaluation/test_target_prompt.py create mode 100644 trpc_agent_sdk/evaluation/_agent_optimizer.py create mode 100644 trpc_agent_sdk/evaluation/_base_optimizer.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_config.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_evaluator_call.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_gepa_adapter.py create mode 100644 
trpc_agent_sdk/evaluation/_optimize_gepa_callback.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_gepa_reflective.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_metric_info.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_model_callable.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_model_options.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_registrations.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_registry.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_reporter.py create mode 100644 trpc_agent_sdk/evaluation/_optimize_result.py create mode 100644 trpc_agent_sdk/evaluation/_target_prompt.py diff --git a/docs/mkdocs/assets/imgs/optimization_quickstart.png b/docs/mkdocs/assets/imgs/optimization_quickstart.png new file mode 100644 index 0000000000000000000000000000000000000000..d68dd4dd6e0a4cf1001b4a505ea71cc808f74fff GIT binary patch literal 307225 zcma%j1zeNu-aoB~2#SKBgd(Ao%IGwZ?vh4gbdF8|3sFJ3K~kh7Bu0sdbR#gv0I4xz zlkV}q_MG$n-}7*u^FGJV2Vta86qMgYDI;IT0}&Y+(bm= zYNsi{clUxFfIA}L?yjfG2nrZ^7%~QvnjbVN&n|-@;_cl zKl-x~yo%CQ!2m`?M9)F^AXe16@tcT9mPqm8Jslt771XIz-PXO|YcD6`b6W-_ir{Rg zU!6XE=DMzI*tIjZkac-Fx+~XygraP}XKH3QGRZyrIL3WlmF(tqwg$e277}IL7Rtu; z9J&gHteQYg1xSwy+VrL-iA}-0>3*#x1yE8_GCT_)KV2`|3$2nXYw&HmW zE4dqtS{mtl#XZa8`HVU@`ebn3bygi07$to^!&p|8^!gZ9w**;8%J};$9b9Tlj%7O% z1TxyPa~ur?j{R5E)sR9>7_R*_{<@Lk_`dXj_Bt{7RQ*rLuSM+EE#pz6+T{UqwdkJ6 zoC?Vvon?uL$!P!OKep=A7rXgrzQO+Woq_i_cP_*k1y?jSPHk&3hV=-s zpvwZ*S1S6H3iXOp<$W-@30C`?nBLl^&X1beITo$2SrX1%x^+XyemE+@EXV-A*B9Iw zdsADoMBeX0uQ+>J%)xnCHevGd3AaePRFA{o*Tic4U;0MzDF?6lWKvTtC=Jv^t0Ch= zP%9H?$U>$P!d__3TcG}l9t78=dDo@8{c5Fn{J&iRM@rhvi$=JJpV$B8x)71nnr6yH z&`b9{uC&!GFl~T4BCC4PhR!ZNxQ3m8(3tuRg9qQ~p3v%7Z}Wz@8@-ym$3Dd2pP@yI^a`mDA2;Lqp3{X)nm-NOB}eB}iG ztY&fF0^5AH0WW}Cj zp`Ii0kt;676QA0SZmT9k-k)O@S7`9t%F!v%ijI!HzC2o*AkcKE<%OGE#o8~?NbhD2 zRe?p0`1~Yn6N@rR7INUsS4)dF3)+*L4BRovkJQ`W!3oTF#tAx6GeLgWjAYBx)UJ2( zq*d`5RkiDz1zPtW|9&YWx@tI3i{H$bNvc}tNr`Z#v5An_h!vmknSHw*t&({A-a5L^ 
zWdu>Ud zzfYoCw;eCuQ}SLb?p9Yw8qx~YS7F_{Go`pZh2q;=N5{+|WL`m+*Z5nN;^GSjvBk#6 zcf(5)+1hBBqhhHY=Fz2it~MI%4wfJ8Y4FQ}!)ke5t~|I+Z@{hZt# zlsV##TM~+W#0*#3;we!{Ij-~_x#`lhB&hq5e{>Tg(5f7FXI4kG{u^VdTRwJhdVITg zuXir496GYo>zfm&F^WkSXuQucu%-KHNLDGg!9S4orIuG`mP=E-6A!(F2_4ca>K3(t zw07C9mYRP9{it}KL=uVW`yIT=r;fj8u1d;D`M zhSwA`wgX?;sTY6qGrieZDlC01krK5n7IausoBuHULN6aSli4X?8)2i$c|AX@Fl)n( zC{p58e$dhW`Zm%1fqOwF{jQ z%x~Rke3qpD1)(tJsirJ!vPgIpT@)~#5M#Y-^@_50f0p%_8|)ePahslG4ku)lXRi~7 z;qvereycQCQA;bDd3^R`h%uZxt{C{#~*raP9=+bek#uM zm-JoFvHhJ`_iR6~Z08-L;3K63ekXO+Ow`v9(lnUSS`cehT6X?doT;Dot)C@mM50%f zZ^!+DpK~*~2y#oA7_sAn{%rM@k=5Lxz{p*c*2)Az@GhoD-wIW4z_N4byZMlA!rOf3 zT)2SQ0NS*{WrgQ~^>Vo!%MNP0`!-j5>+H8uuq2sd)iVNao4dv|`G?y1n-7y!pvK<` z@yz|6lyWyfUI}oPNGw;D-tWStZMz!dA@21Td1Kc`9B-zKbRDxC#0VyM&xff0xH4q2 ztk-3uE%JMUkt~?_bOHWmNTS@ zAB!BhxhDIc%4qy&*ZXb9x;vS#Mubc`k_twT^)esht|`LtoG@|hPaCV*Zy75b;on;s zpPl)0cPAn-V4FUk>Z~xg$orRm`A0SVYV-%UQwdFnKyXG zSj~Xo1C`9xwL@|)%n`iz!-JRhLh_e-lIPE<4Ypzq7LnC1RmFl#Qw()J!Oe=>Ol_Ow ze)v@}E>T6nX-AW?hmgzig4&4b8^P?uE}m%~zSAyhPX`){Q><3&S>;G=D=%_CR)FK@ zaa!>5V?NHV8#+RP?^qP#sO_4s9e6u?W0$w8(W`ofWmkf=$?$%*z zVl02My^WKQXG^ioUOQ0}XiJ_-bg6)*YWsq%E;s320b>-TFq+SM!tLe7_0E|O2tP<- zEAxug0WzCHT*z^(KRtOC?z!BpXUX(fr_4eQo$Ooztu}@+nZ#wwN4_r6t;Uxl2Qc$k zR9#o97{jqU{v~Tr)cSl^N}Wenc{GQzIJ@mgYJ>ldI2zuN$|v{wVwr9Xb*I?gmn`Im z+6XA@jE-79xzge`EP1|ND!5RO;dsUfN9CR7hkL^)*iG+kIJ%0oA1x{Vnb{R7gSV}c zfZ4oZk_oE!MFVIfGz)PW*O5Nj)Yhi)Zp1Yn^m3)d1(`Y5{Gy2bVo$+kl(iRFX1Sy} zbx>kb_eTGV{m+Sn@-HPa5+jyVje#r%-^*c7;3&Of9s#I=IBY5Bjn5*DyT`F~#SN6& zYE}MAU&r$pQ-?PHV+%DQy3sNH^yr=l2GIluv!7ai_$@}5%g)P;Sak{-_mWy3iIJk% zJL)ns5YMZ-nKw}V2(m_x%exGF_=eFmhbzHJTwOy|sYc>?k)yRzPM?n&zpfAF zC<_>@qeQl~hTq8zlH@?@hQ9w2NPc=t!u5HwXO8o@o^{o;<*eE9!F}(h1bQg}DCpW| zeezN~vAuLy@tzSI6hB2$x`ekLCw`fnhKAXwf+tR`X0vVsEg&BrRJphvKx!J;H>|Ic zabx4h8_3os{=?B`cCMA-NU{Iqp=bm4?s1Cxe5V?NAeRtkqKhjcp|Qe(W47n=WD@+F zr=s^ggkSE z{zE?qtzhs&7*1HiPqkVW#z5`2HysvXgj=L{HXL2KZwMh$s@@+oXAI8u#!cq!!Iw*7 zxH19;Fu+l3iF%FSjP^|(t8z2FpJp-2Xi#qBAZs>5%3qHq 
za8uKu);%{pIqRMqpJC}8j=!{)R4ZhB^@1TA0Us?+@0#bmAYG`d#=_PtKJS=7JU@#2wfczgsSFWvF^ z4)M}uk^YX5Wg{O*shWH-s(b)H7!?|s5E6z#us$I5uT!CMMm|8ujDtt6MmiyrZwXY3g* zF_DPm7iP0XRyk=k1qH%X1vhGx3$$`4^6xrNh@y>`Lr7vBh1rn?6;xZ8p|(`}ukWC8 z?g@TdOSS50aEY#WP$Kt)&8h_04pmi2l39I(cfVG1DE?R|7@%!6*XVb597cFCV`bTb zsnkE*naXP{GFJOPJm~Fa%FJm!;=eoFcLuWCE^&JiP=qLenMEDKM|N7z?cxt`{$yNs z2kBD&a{#D(F%jKA&#k{@(VuC$kJ&gcEj7}OZ+3aA6-SXE7AVBFD5BnwWXVQNF>6)dFOdLaU)?u7+mc>Sd=4rnjZh$#w=AO8TF8 zlZiWfZAy_}Z~C(oXG0SQ&5JgbAjX4NWDG~I8`$hIed)ft>D}FZ9{12eQIpkX{>{z7 zPwK&RD_*__YUQ_9C*loN1t|E_CB1aCgp0oPTnLX4REWBS!p5E;fwU4N`!?~(m;BMa zX`CDWY6i8(GPT_^Kkb+{07joe(~vbB_j6`jBV_vLq}6X4OG>RU^6P!ucomWVQmUm@ z$%n;lZS%8D&11?wcK&ph?XwI#zkI*94z;8MOf4B%|B>{+h1jCzX?jhrz0g1NnaiU!~L<1K?=-k5{^>^Ti37Y9*)b-k;788SE(%jNBl z4E5C;F!7fL11tIf9j@2-@BaS9f;_#{Y8z;_@(?H#mAdb)osQ)-Yu>DCnyPn^`s`5W z9U~^7oeLi_ZS>>IXZfgAW$&3mluIYiPu^FHlYoZCFYS}D=oV?4IkOyAevhk-RqP4O zC`R{XOzCC&NxF?4`=MeUwzNu_LVvOw*cc<$p$0bU8J$J*a@NKuEo}1Crr=aIj|&_@ zsWE6?`>lzhsW?NX&9X$nSd!w*Oiu7IUV3WnjdO)e{s+9vwVl0$R&s`861>q~8-MdRsmhC__O$^6&# zJS@(}L?BfiDc)yx({N^R$G?>(=E!r+cBpu!z6()Vj6IBlH9_|W9ju3{ULu>0EEbBT z)sF=E3biz05gh>)ZpDjwrZ&N=Q#F?QFg`O*a&D8esdu%>4&@=nB4j7LaTu+x!yy7P286ZDPso;?mv%6vT?M*73shm9-H3sTpo$b z&j=cyeGhgv4^VE><8TosE;((0yzN`Pf9=t2I}r@w`KOqUH_w%T7JWU0P^Z!nZi>xY4504)N1wvb)|Hvm3|{0Nr!sl@$ROr6x_Z-@~fKx zhL39BfckVdnVo5r-2mOeE}x@OdtGaH^l6=K5m&{J$KuwZ9=k213kRr5fLsnyB3e=f z;YrxV5FO1^-heeO=#%ytbsbo!cO)U?v5k--AQ~$niWE_ zn8pKaOu2Pe=SZ>9{I8d+YQee2C*Vc<7m{MbFY7*4u^0QAUh(Odq>l0RLMoJ7Utf|7 zvwz1Tbqi~rqLwD!@#;K#_hXSspX|0oEL{)xTrbn+bQld>z{VccV4b9<1NXo@-M};n zh}aLe*G@M0dvnPuhP|8UD@lX2|1h%~oSQLoM>;r-SC;9*D({(Ke`t(g8U^x4xB9M4 z`pi;CPJf}kBHRvI4Jp5rM3Vq9L)H@~`IYWmy4GzV}(G{0KLQa)yUGwRE{4aY41?5IpeCUu_TYGnTznR~Cx{z3mB}~#XMen1&p1AmFKOJj$Bcg| zoLbp5AdZsZ;iQDTJZiBx3cGiL3d&Ai`Uq6SjowJ?*8fuT8ow%T1RUeat84Ee;6{5afmh0Zq<@8gm zYIWySetKG*ZOubCURypg@tP7vdr&R|*cm_MWF(YVJf^bMP<2}M9tJp0SdckdIKRQz z@ToR50ku4mc(YTl_?mvc^-MX?>Lib=Ws&S|WwY)l@O1kX5I28St;|ngO3j~U>VM*L_C*t#G?LWct`=&e1it8&gKdR?z1LcekDO}f!-Lz4 
z)bSY?xpf{pU)|Ig7dCA@T5>e|qT-cH9;8!}!EIfrm(VCs&u?kQc#G>Cv&X(j)1I*1 z_?PNbXUvLo?zL4~)0->o zJ#f@5pajN0gHz8{*}UjLKVWr>H~ye!GO*((xmx!d5rY@Rq)EXZouR*V<4dwBpQ4CI z{G}l-u^suk^{!_EL*>sn7$ZBk1O!`fQ?)1@TA5NqBER?*I0jN~(7YtR#*sVoBlO zxzL_uwG2RRR3YZAb+*A3ff^wslLf1qmOMKhdsjvtOnjRu6%5msq|p4$y+84X47~1`p$@n%}(l>cD;`ZnvB1f{4GfJ z9ObE#4TWa4;;U;jf~-Q*&RpWRAj#^MvP=54;$>o^mT{pW{*TJ)`!2l2EDE0SMg*r# zS`B&J9`9BvJ?l6N$}CqSK5G+yKm|RA3c@w3b0v8v%BvBI~E zGB1QHXWp?pi5VJ5BIx;|2|fYnYvmlWH<497JVx6VOIAk@r2oD2+D_DzwY3BvAMLQ9 zx2*uZsk0FPCcajo`g5GZosQUo9_H>2s{R;yb3B!&Eh58akld<>>53Z|{cO z2Y9eTRk}?(U#(-NH6qQ6+Q*#%cPFf2JNS4fm@{*6t|NxUVfdh-&G?Ib9HLyU;+!u- z&-Nz5CZmJk$pv=I25c@4NE=mQB;oV$rj>i!3+AJkRSO)A$8c>!hW5e!l{OR<=T=Rg z+=61k24e4g+iM&I_X9V{YetEF3xx-Rkic_pg zMt8EYu-efOGPQ@bGmn-K#Vq0j%RM@MA`e!=p@oQQ&ls_vj^b4-lb`kQ2V7G{P_u^L ziC1L;Fg7C;Tz0fSKVI9(x3-sXcu_OGCDHos3g*t38@uO#XSxh}zaeqEg4v#N?RP^o z+_I4FHWVjwT=+g+zFaNP8+TmRw4zFNx?FDR1GSoX;7m8Wz_6?E2A z!JD9+xh~ul+GWA%Qfl%u`^UIPLfVOZ;!`u>W=QP}%NzXCVFM?mwy0%tEiJ8XLA_5p zH;vMT%6NMKi+OzD}^R|=qwe_2W%DsD_Le4Fu zxOnw8@GxpI-qIC6fOzx?y8DnUYzFF`ih zJ$(xmY{bHsZ-|K@*nf9o6gX@{9`7}h9m(DqFMEh^=k$?F$QY{%^T5xVZ`?3z*!6Zh z<>w=l?!2nITK*_nfGYxB=al&6O+*jEjWMgg)v!EfDOmNpBvxKpw-e z1-ekD^n;F_YppG9cPA?+a^b4~*%6L9R<>wQJw@bw)v1nx**2@4eIb?*WyF#3`(D;? 
z;je}2GDnb=gW9!d*`fe5{$f$@5lGk@KE=@~ zt3nJ|Ib1z?v%Mc@CoVs+f!M+GqjnpapT?3|E%3a@sUu`^G0O^?cX$7fNGJ^zDk#Uf5A;F%Y(5N zu5Y#>7L2b)g!X`Uw!)aSyC1O);e5BZK2H;zwzvK~Sit+wp7I}m*MJ+ixbzl+mrh*3 zP4dS5B_7O3amsL!fi}Si2aaPb4Bgw&5yQnHB&0j(i)J1m%B|Pg+(Wew( zYThO+R__uS%d3BHJ(nCk*B5-J20c+*VP{n36k8@Bb5pob3zEM4?fHgX%UlALU5G}K zu2-oM-_#36oi8q3dEJKevwv}vnwQvQZ+=QW`{Km;_(>AOse0i^gP+n?BR+`x^qg== z>|2NVreG#>mO>!KUoiU+O#t+2SH%-=M(*Pz;5MB#lVK$X7;KT!evlxTv+0^ScNk@^O8wQV|N47uE>z%x zS5L{PNB;R8o+w2B@=!MKj1`jJC23#h>^oVO^B*6VWdV1cOF2XM+O>Ot7+nW6G&)Nz zBDt1<-4|CdW zlfge3DeBAXQ@D3RGXBelA^qg*51(q)y3fB+vA4HhykEO}&e#GsZ|cALWgR(%*IW8o z$OQ4mpwW_{)d1sj}&(HI~3q9(Y{km_#{q z`W@%a&Nq&#$uf#vTl(w4xf&nt4`#_xB|+?>CqP>+cbk0_jJobzw=?OVcIO6`pDT{G_GZx6iCy5H-g7I99As3-p=0amCO3)TJaxN z?A-MbW5&#!kc)tR+$~ik6?P;9S1$7Cbps;`6!JN@60S?PbfPGznG}Gxp!&6~7KWY! zWIW5j9O9trCrA1K#1VMwym9e8DI&_Cj`chSsU>&(p)aD^C8IlC3TgAIOQ6XZ#7&@8 z1%tI!&Zt`sqa{KioZz{1J&6^;$3YnfmG{uYz8(>UF};W*41d&II9MGL`}V@fxmK>Ec7bU_uqN=pD@n*9p4YhMc8ey+m@Xbdy^D-Ym~D$p**e;tjpQ`vZ6RL#mm=^VPCe)#EVS)$#7az;pM>3-z-b`sk=j~X zRzN3EtL{ycNa3qUePPDK6p%@*rf=-_JOICko=ixCHIxv`F-_zU%LBv1t;9~QZGf+q z&<65Ns&J`gIlH*=6s-$|@(M z-`RZPz)4~yur*@dF-!PuR&Zg+xlyhy)SR?k4RR?I)WX2YBr|USi0UqEK}AC6d)x6L zT1KnXRLbURTA|%gUT?fdFL`Na)U^iz>FUATn>)`a*1kB5suA3s(mK%-TGU@}6&&SU zCwOz6peDZYO?iIil6z=0WT=l|PED>ihnfa&DzpX@H6*heeZa$-`ZuWzYr^KY)4nK)PIh5mb4qgi_ z-a9A;VZ!_j3NpLjAyD2yq8C34^$#}7sF@bQ>PyDRk@7kYOkeZPd_jF*+<&|14U5#c zmmslHa?ec-yPu!pU>M+ph!9_>x+f@NN6*9uH1>)&S44V;+1&=vRdUs>+U(u~Pv6wk zhKIY5=Mid9Ow%F*QE~i>5_x?auvmac_ z=+XR#mn`mZN&N2Y-C%qG+PG;$yfhvp7&-x5T{fnw9xO5tgcT-lEe^CWKB{gXQc(?g zeY7{Y-13sfJAG(-^o^ERiX!sI`TwFhgw)@Pk|~w;$wZsh`;<*h-&mCaZWy}CJd_4f z^Nl$*N5o;|L#}c{vK!>|H+r0TJ`mbo1s%qH|NgKnDIF9msnD3pCJF;)nV^Y@Pewg* z*)?I2^6bw$bfo0H?h~sU?%EWZ2O4&2ysToU;y zq{X82lqw_e379D+?P9RUy9Pr68yVRTlPF|JcSPWG`C%=JK6y1<2P>PJ;L-wIym# zn`we`C~e@xH61sqqzLz9({TM4i~BU2tL9T8j#8IU=jM?Q4^G_#cFyRfKJ6=~=Ma7v zmFhR0A7i;SucP>1(y(s(qnO8Z@>Cd`0w><)D#g7Pm9^_l6$4C%neKKL$j(*X$pB0~ 
zsatVlp$Dd}C>23QkY~6wpdhR_)0-13r~GOhf_-|e&cwto%;6Y#M){*0Cvz|Xm@eF<+YFFBQ| zl~~|@tn_c;a3K=#X;u&mb?Z(avsM_SK;ZYhV6B1%`0o&ow@5GUdVlBRGAi>RwCV6!E* z>hE)BYe}~`Rg>rLR5xW*U*gg8*`(acu^q@-hzAMe-3B3zr zavFW(fU5IO07gwaXvXYQ0UK|Hwf`Df{~52g50Q{@*P1ptGg^Zka&Q;L-&Hq8#DVj| zZOGoE!3MQOvL+CUMvOd)`SY(v*Ud#ubR&QzZ~<$-KmeP8q=e@OuBl^KKC;7(ND;_B zAV9Gx;_4O(4rGWd0+SRJ>0Gy3#hWa)kq45PkvDGKNTl@eukPk&vdTfS?o$N~*75+y z1A768u{XmgsP6_Qi@5NEO?^>n2}esUJ?ucnovYj~_h-iWSA`@*@_Lj^q-4IY3U!WR zsqN=bK744*C;d`q0r~ll$a{zx7so3_PY%XrLE&=L{iIjCG~7~bH8F(9S9Eq z-O8f6Vn{afO;yjj2))w>$_c;a;=xig8CcBt6s%9`19j8`e-}h~s{!JtzS-(e2&_p)S3b(~+8kD5|uL0&k@Y<}lmVQEO_bc{+ytL2T;4;;C7utQ9@s7o_J|XWGfR9tP zSCM(DR|asO2!dxFZti!` zh0qXpn@|E6I1rBX`^@jB4T?U8w6y7Xm6L-sFjNKAu*4l#XGdNIp_q5GvK`E|GP6SN z&`51+WOgM@f}|J#)H@RD8Pb6dWvU#=#?cq>Qo24I_S0fMm*UT3{OzcMTWEu3=ybD-!Wec7;Yp?u9j$Dz#X2eob8iw8J- zkQGG!!~N%}{BV;@3e3^rMxR;=1!Ha7kThTpu#!`sCIWX*&oJUY2P296PUK2hBaDKQmCBw(X5m!6hAlXf3NgoZb zm9g_59loyu;o~k6MVL$>WR@M5>F?YmC8PbQUz!eao){zJcABg=pyz!|n`txGLtsBi z4uF2nOkU@2crUb^1oa!_Rb`^BaYl*LolTmQGCnci@YfBrki1+%jZ58ejlNeI1=#5# zmxS`s{#F`hh)QixE!j;`#y}II@hntU@3gI}xgPoH660E}O**B4Zj1d7|IO}tDr+7h zS=MSa-PVWa0g0R#>A-!3p*&S7`XQiP6zCL0+YRx`UHtp5@z3gOq51Tz65?=#pzMK# z8}sNbQ&QxPJHK%a8y%<0Z|8-pC(87Hd>E2Q3&gn;vmwxC=Elk_34}T_&elBI8S%(9 z`V`fFho?X>xx)#hNa3d2sK!-5>pk=|Kinq=Nt9C9e<`v601)z+zMZuzqVFvT=-m`) z)J>=SkB9xIAGFs2TriLF?u$7Q!THaZCJIC1xE7E3uJVsE_%vTO>8^DSh;-(yEtfWJ z*S9m0*pYy?9t-4rf}~Fvy-57ezwVFULOVoarSTdw+BWltG-;uk3CM|7xs~EgtsDhj z{tO}cdzqlKxX->7Q34g|@SWag^Lrc2U^Bd(T88{hsi9Q{@3D0dX;BR32|mbLSY$1} zjIJiLz%-10(p8?Bo$aAAtbnSP_-TSs6}lshyY&5Ea{!}P1{tKOUgdb+JF86J<6~pz zs%hT-TM7GfFZli%=;meXd5^hBPFrznfZw{e({$`4?LFyh^Wq2-B|&Df+E8sf@a?Nv zjq5!}S|zLx-E~0u1v3ahFkZg13OxpbB!Na*;Ai`*Krdc}quK*aJgi`H0J z24P7M7)Uw*LsqMAhTxtKH^IOUtRxQTYC2$yX?_5z?t%KJ=p;udgFqOI1IL?$t_EvEt z&XgtUZZVi!S&&-vm-&Z2*IF;JBg;gj3owG1``oJ{V2EhE#+9r)8Fb z5bdz)rQphxwu93r1)l)2*}y@Lr%3eIvWQmtr}XqsV#L5p)W|2pqGx$8Stt zh|#_Tf}Waf8Bjg~m6mNc!>=<$nwKgVfAeirn5t_dwf+?@r1t z-F!yMkZCx-IRKkQtR`-QKWwCA 
z7BH?c4`mag?TQDZN_(q?_l%f+e*}VL)Ov(Kl2UWHw6p1YDF|U1fu_L&)>h)jumB|YC3;*i*gazFtvKJq^BHRqbaByry5Oyc7L6s zJDuGuVhEv%s0%pS8tpBB{#es@ZrIYx_x=9ryqZ+%9%S*}aj2OLcKTFLs1C#)b}eE7EtHaOm@LMr+8}Jc)V7zrme?oR2MeOut zK!?i-xU*`>t^RPmD-GuDswKLC@m6Ts*e{#nU^W(%M0oj2`uxrmEY-9B`RR{Qu10^q zV?e@sc8p4wD+tPJVNe81z>eJx4qg2EViWpIY(k9MX+JWUEEb~_+` zM=emd!3BUYjTibdaGU*S1^c8Aio{sOJwEz!qAd*NMoW<2Fjd}OLEiM&Qwj=zU6aK* za0~7<8b%|%cy_0qJyHSi4b|IiEi^JK@85P}HXYx>T?yX zXznCRDf8I-$H1=`1k`1LTy7&>CEL4qcx^->B#A>Z@V2ASatC_}j% zX*dM=zRs2?)nBC{k5~!blSGkfC!6d{wP@l&1>s}*J;(p6JKqS8X`@fMrZl>QpB!7p zX7|kGI86l~Hz4Q-O-D0j4-TdQ6^wVA{ee(k_~^h2rt=DrVnR=Q@kIz>>NBuYPQGU^ zd{Y$NKorUE3!*3KHIYu!Hn){skKYYD;ko~=y=M_I##*cumqE$O6*55;=~cmpQje6F z@Bp^wzCZRFiX6xap=J6$$G?N}&VY#nGMZl!=ZfCAe4{-EsE-nidLW<)w9N;$vhCn; zk8LuhwQkH&1-lwQVWf%Z=5KZmo|g*Bgk6DHm(pAj?(Vpr&G_N&t>nFS@Twyb$FXV0 zQ+dR?gu$T}+v)Hf|Iukqqlu)?jz&5B@1-+LB!!t`t5GRH2B8FaAJXl^qJ3vr2y;-5Z{Z}6IG`oB^KLfvoh!o1alKs`tmZIAt%SR zPOR{#Z1VQZ*laVJhzHZrUu`l1p;dR_!_h&i$Dzj|r=(u5!vE)904xek+7$YpXY2TW z0-+LGP6yT_Us&Bk>BHX-2>E7=H)!V_Aa&`DW=RoySn*mV8PZ`ls#LQ9uJK%WApdH5 zZ0rrDMa$~0@-}d>#InIiyF$Ex88#I;P3hEcGw??HA5cB2T`N;gC|jC;4fO+B5KRE``8wQT& z9Qyg%9SN>UF@>c4NIGG9Y&md$^A2*Qw))Olk6A)o$%(QQpeeWGZhJirzWt+0BgS#J zZ)ts#tepeQZ$2W~?5l(hU^bChr&B`OMnGp0d>&NoIAAl-=&VhxG3e%jY_*NmK*LVy z`kvau_-#trQxs`LAa4x&WB$fGm-QmVD$@45-^!jQnM~5S3kOv36QgIxZ@6ip5W4~)K$s7Ip{m3G?skx=b&VhMs$JMIM5%*xS+tE<8= zzeovguKvs8Q_Pt;&C*q4j(k|*p3EF#wSjJ*8G(q1l4}}uCK*z2!R|3(TsBg!+F?WA zgNKh=W1WHakF(bgBeUPBu1cNon)yy?LtfkiR~vWiV$BasjB zp`@9tJizwu(pHz6CsSIa2Hs9Am>mR&)MPHrd-6TdmVVfzf)W%CRUae@VUhBC<(w^S z{*@>{18T6&|Gpb7rS}NUWNsPZ&B>M0+-f>H3jq5>zD1bJ#9RFM#3+05sWWP#K#CTu zq%f+je&)`fqZ>-OIpFQk1zLgA&sp2iUws`JAx*-O#KBT!vwmh&+k5r6E<1FxZ)dzY z={fm#SM7p=ChkcB>9sj%Yk4$*Ff|VX1S_9=HIWce4MD7}8S32{08AbweTjEk3hH=% z^`cqW6X6mPN#Ll0cb4}BZ#4(?-DTn$TgLRDgQSCJ$@fbRmK`c=F+$!^DNT))f49-K z$$>;%;PS%r+BC=S30R-+7poo&Fk!9xGw(smK_gWL&0j8($tNc%ZZo-wh;VO>4sZhP z%9|$KyZAU|3jhFxUhTO`$1R~=v+?{>nN1%HSKS+D96ePlG-KV=0o>AJ=pwhMqJ*tp 
z{+mv44bu2^Kk1zJT<7`=Iy3pA58`vgp1N26qW!l{g)0I#Fi{H=Tw!%!hD5b9 zlVKWy4pt^t@x7>KJ1nmjet+;pI_E^idn&6&Nb%%1GLAf#J$LajU=xe-e+-lFGoYDr zVga*^d7^Lt^58nj5WDOD)Ho6|Ih{sVl$ybgmjLM|LB~goDul$`+bbf%N2MU|o+K08 z2y;;@ChLS-RE^J8@4>8^ks^#luUh}93b2N;a{orug;s4N! zZZsqPILHHUO*)!fqKAT1h(f8qPO0bWxbB2OIHcbhNSK;w52~9hVT(J46)m-3*v6Sk z$J<6|XjSm$-*#2m{ot_65$fNQ0asKe3q{0Z00D3xx$8PDt368e7I3>^JDREs-2qsq zRY1|+WBajuC4dDqdK@goMi6=-qHtol0PrVDjR9dC+iCA6^W_Kz0WbKHaL{Bfu#43) z5!%*`y}wfR^nMc7G_#F~U$5>PuWN7)z5grr8*66}qK zG;_bRgco3Xub@hlIv&i0`lZ4NLM=lnE{&KW5F7em)%(ke{bc~>Ky+;=$+X9jK(U;J zR=pmsx$2YxF{DcRH+h4-l(g!`Zr&JA0vlxB)|(n%Tv^XMA`P{S_f8 z`=^P{1&;xd^scb|od#ttZ=Dhj@KY5O29UGUOHh7C7{%S1G1bbHEk+H%wWjOh5s{WBm0h_X1Cf@EGUVWmYA&kfow&+ZD#Wy-RqwkdStFj zx+G$D;$RM}k(ep`cD33DkdtSI8+Zm1T?mY>|9(Z5zJ#D{VFDpt>8f>&ygPi@>UXLtLxv_GU1#FnhUBp;Ng94rF~ ze(avdffP0LwHG+0_kD2#rZDbVyJ;mr_j`7D{&3GOF9vOmtZx1x-NT<}9QYUPSjQST z0tpC8^+-bAk2rK7QGIj>aB)EsfUtmI-<>SfH7gL)3Awzw3)FdK^Sf)agp%S9*5Z$V(k48da3c+O;1Le2&Vj1nq#Ma1 zH3v9=7u+uDg~aKH2ZTsMtuxR5Vbz$gp=MDzKI)R3odJVNy;)eS4{~6 z2icUcpJr6&RotB>A=1B^hhiSiVhkMY%Dnh?4>T5=s;D9JALgCF8fb0MHOC+BNrn1& z7*zmpoAF_#FV)m_v_WFOZUMCrh?RP=a}~4YZ8qcyx)IVK18_}g?9M`Co0K=uKnQo{ zfYiR72>iD@lbk5>v^*qx7cN{FJicNt5OY~l3lgX01?oIxksT*HABZ_6m{*ZUSV`md9*C9xn_l0!nl$1KoDojj2Vr#- zG%St_wR;OGs=pIWyH>J{4((o~5Hu6e-m|1lykyLVS-D>kJ1wwLlU$MTa(v-E3{*}K z+_S5qMm7aEk&Tb^__Lv)J|68fc5Ec8GvP7L(qy&96rlNnrQn*E3xGg_JvD_}Os3n|5O_yaZX5acz_En8!RH6FY? 
zN4;$7+=(Db4!u8GJm0W+7ju3Et2Qyl_DZ-wLCo$In*9hZGIdFlS7vY)h0*tmga z8)}h7M{@ev_2~xYfh0j2f<|@5`jzpJes$rQ>N%j&O&HY*&`{ahKk_ETn^jUp2T%|H z-dAqd0>8UFHl>5M-G~GPwS|yhq&B(*pSYZ7ShfHqB^un?>{?M9L1wO1CsjFfk(&*; zY^NPkQ-idfgBw@Y^+&9R%-&$(_MH2R=qJc9Hfb z-eJ$0_r^|$WgWowduosN%EvQ`o68MT|1AmsZ>NSm#}Ve{D!Sl}%>=_^L0M z9Na|i|2*&i`%|_JGO#c=92ptG3)&$ecda`K!^S?`0kJU{)&8~oPC(}avMNIJH99D*!x{2Dc8)8}xljc~Ur zg6eGm_b8Z37oRfmM$N;VKr#j-47oGBkhIvqZTyO46%d%fw!H~bnD6%Qf)v9of~?Kx zJzi;d{_RwilhFnqXv|=$x)t|W+ziCQ14^7wa9skw9df?%t)Xu+LmMOsy&#W3>>?J7 zQSI{Og$l)WM@t7NZaW%heKJLmoRp5Nzt&e89`-}&R*&Pn3BuGjPV zcs%awfxG})`-`N=j8yT^lBY8BKm%P27OdyVsSp=;EgBDW4;i&;hZw;8^8)oC|LEMW zO1UbP@~F7{DZuwAG4Ia)9^qbjQt?4iWM3@%pPyg`W!ah&QEOW^TONm4T*a z_bLi;ke*eLD0~?lmsD{{!CE@6(Bn&>{+NT}l9ogLG9a!6SIw?IBvKod{At3_J|v5l`XJ7VBYNoI{$xyfbYp zHXdrbMG<0YS`BI*k-C%DF5OiqUnM`rs~9hordDM1uIc7+d}L=MOf=1hcU#YA($kz2 zWkE7*rVJ3CL`eJkj9*k`?cX4M91}_G0eXip(8@tBLB-K7@!O|kT1hCz=u>uAkJG`sAo0-Q4z!7{J41`0#Z%uBQ6ui(KrBYSH381BC}7L3K03^b)KU)rZv}Gz zQ@u+UCJq!e2^!6RWyBR&#iH zgg#0~>G$WgVKThdevohNRgqlAl~@zImxh4a?)TZ4m0IOBsJRJ1lu2JOEyq`p6Jf2& z?&)w7K8 znzyIkf9VT2xx;vpt4D~5iqip!FdKSdxZFKCTA4?_em%K+K%YD9YfDR7T&Q4qlb+zK zCdtf=cvrmuD=WK@J3rPyz*t=w>zMRLvK0ixn=8SeuI&GD9I$Mi8gBw*Ry9C&;%Iwj zoiQ2&t-EJI3Ci!WWH#PjIO9G?yf7ZupFATX?n1 zZhZhJj8wLmgf3|zMcVM$=C4QCML*8(dYRX5U>F-E{ z|K#!KVWGFm=)geI^9|^a7&yt~GxPJWVfeKs_;*@Xuzo^n=E4)!GJ7YYjR=V_l4|W{ z5+a7#oXN}pofM*#KEO4vTZ9vCwIyVYf$CH1I2GCd$rV7aD5rp7{ z3a1m2JQY#NAVb_-@-{n{s~X&*xby2Ul0{s*yGEWsD;16hD||Kp1Zxkvh73&iU+u6S z6g+<=fc*hAgOOClyB=^4jV|{Xthy%1B}n}IE5JmMs}uMG$#uKKiLbz4nHXvj_hbZK z3Azu?8D0@sFEQ_Y?*;$Igr5zuybOG*@9vOxOoQQuri-Vwq%z>i%%u$~U5F(8`m5@W zai)xM-O<(7lA239p`&|QVAcUaMMHv7lDqye&`D0k#bg0RfQ}kgWDrfEzjyc_c(mf* z!lUN~>{n)b&j9#ByT9+4lvgh-#bP0;kNPljx_i;fp zmVW^<6D_wn#ciPkC=u{r3c6mi`9sln;D=ShKzB1U2Bch@LK4Px&60oEql&4Oohpd|dk*uq^mLfTa}zEOk9d6yVy8hNF00>gVw898fOq z)AacHB~Z33B<U(k-n4s+0u7e3i7Se=e!zyHBZDav>40hU$>n=3l19VDg zz8b!K4%s5Ke!ydDBFe|eyXUaEoi_e3Jw5$Igi58v2X5%}#>MyV+=BiGw9%&c$8ULR 
zMBa^}*YiPaO0kB0^N?|YWRXgSk>EoOQFY<26}i&p22LZ#GKmqT^Kb5Z!fYs)7G^V< z39)+5*>SIqN|fzMn3QX@E*m(9khk0BE?mtHqlIVL*b>Gdct-(ZHGcndeZ?lh0@1Y_ zr_En5JmT+TYW56Oou7t~j`G?w71xO{n^y-up14_dsl?H&o2yx2jCUozl1EifAf~q% zx-IPJ!yRXjZ*G2@Vm|i~4CIi0_fGe|Qbgvq@E}@KBg_RD<`T_p)l=zD-&FIZ%b%X{ z0?C5FU(jEjKF%Q|B=l_q=A+qu$2a>$M3>F9ucb&?KTzUZ46hZtRIReRx*xeadHUBu z{hx>h$)5s9aUsd3^b>WMFy_(h0DpSw?*9r@!SDa=Jd8&UpQ&z%?frJvuWZN%i`Xj^ z`Bh&Gjr?tb`-{hZ#caLPE26jGQ=3{o2cGI9D5`Js$u5^$eBv~%I&|jB!vgKHm`)WB zX!tr#2=w)!=vwMYzTtN-W3`Zwr8hnws`uQT5!F(Q>sdUiD`&Ib6h^_RZ-h&_InL%# zPnsOfJg=IVYhf0I(lOMCNSK$e_HGD36sZ>y4qwtP)Ym&MY&D_e2)s`Mx7M{w;&l6s zJFMwS!zDQw*?D(>`tM>wX#tW;5_2%;ri5k}i+Qnxq{6>%ulT2#;C*0fj0at4z55k5 z)CYJif6`9vH58oqT2MI`0a3ai`Vm1gIaFIabP9s;&a4;71_(lTW~?F^8lUvhhD|#| zfsO%?O-H?HtCYB!fVc2u%Hk+m!0NU0vaoB;+su71ucXY((w9x5dA8dWTs8)VZs9 zx`QxBskrs4Zd^;gIN3A>!d1f

z1cEv%WkSw>fVC%Dpvh_Ieqhk8cWs~|-a>%k`& z2H!DMC`4k^bK&niedZnKFTkMZnb>r(Ohy)hDGxO_9Lb(oF@opXnYYp##3L-Ae_B3~ zm(gQo-(yXLj^66}aS@xDE}>lY=d5WpDPBJwofLPvS~b2ZwVKR_+o-DlA+^ecZ5{CL zcb>1Avjj11DsiM&xH2j+xiVmM#(BO-@E|ur#fa2egWjS$4`2?(n3|GTcNKk)B;Duw zq{Q)0-49jP9Zs@?d)4p`qYZZ~Ih(II0RgB}W|Kr#nU8&_(Jpz_4~d|y)-q%+LQ!b` z4rq%-T;BDHfzQywzpG}{eIU|#%7PR*DWPYV2flP>pi(5;Uc!0G*#D>CUGr}rp2!Sb zV%xEaeZ><#U@va9Fy?cj>k#Hu;oUo5+i6(G_f7~KR9wFS_C0^yF1Y*yrsT4k=-Tk< zhPxwY6s#T;fpd#^ZfE^GMhOi88im2{kWP`FygN_l%=*)BJ|xY8e_%2;(qZxC=1VU9 z|A2C0Tg4*IWsf!{aed?y-I@4c+u@& zDb*XHFH*0rN^c(|XQg3bIebqhB6ipAa-%)u7aCJpEZ=$-78H!qbm} zUC*gqR;$L#ePc2!Wd_JHPQgORnosbx{UQtgr9w>#cFDDGkz=)C&x1+(ypSPcKkPP0 z8Jk`J_t^8@G#A@I2yR$5=wX>WUa>J(nGaKS=BfdRvkj}@ogn7vyqI|+$5aF@7sagR zpS&j+`~(#)Xjg_~i&%a2BhW&A1~Mji8_SVS2S`>6d4k#c>lWQKC3)!jsL#S#rt4C4 zl#R=H2JrG3Fv9P5s7lTWMiAGh9F6+@B47^-q4or)PCa856h)VBPvqiBQ*A0H&(Y`6 zmYvy2iTjTyTlE{6g6v)OCmO9LGE$=1_%c^ZA|W3eS4_(hn|rT*T$w2X3jZAM8))t5$|O4<$LM?HoKWw5%pX@|4qOgzxE)CBGUtbZTyrIE zfj7U`+WY53T9_`-ApBdRA%Ax6*3N?JTdHo%OZ5KbmMDL8Q@o$Nyp^{=@&3 zRl#50!!Bgq*L6VZr(ssU;U0qnl1ptY*{{o38-yL3F4ULvyvH@0LDZ5K(0`UN&B8_3 zio0b7vi=otNB+FkuSV3%QF(k^=;pz9FoalxD1_iLtGfZAcc<4MsL(n+EtQRdL4yd8 z1ns~~OPKf+yGUR$ca8zz82P~RyR^wr#QAHZ`I1X3@1dt{)q8k``-J9)YKJnF;tscG zK-uMWlW(npA^xG|m&ha6MC+13qTeR@qBgo9xHjc>`;iHN>an<*1=D2rOw3VHKoF;a zWlT^Uvdr9#TdMR1yKB7va*1*>|0M_~Y8dp{u-*W|^>%U*z(AR^XO=y+L7z=5wTv;h zd=2%!4?_2h)&oxn%7Ig+e<%`wiC%zftib3c3?iTtLKd3kmP%XCatrva-Lr+J<^xD6 z>8dg|jzm>8#h58U49b6)SO4YoT-8EvzIiS`IrU}wQrR-DS8`Z+S$cA2D9rE4PoS{s z0}?aOAM#Risv5RKA2`wnCYGs|GvZqwsXmla9c~bOHk8I!=}1lkU={aV>17fHk1N*h znZwNF2iL5Gk6)Hipd`q-!_I_g5}VSXLn${Bv71?Zpma_hfzKhbl{gq5DI_GN%(wbLtfd&8kK8(3=>|w;5*P#HroskP<^(Rv> zz8m3hh1m!914s2IoCuFKgyge|KC&;nAe-ccE;A7!AE`~?70z^NZ!4^#iedIh6Xf$Np? zGjFI#!yL1|FMOvdHnt|trW_2Q{--GRAE)hFeBm$_XBzM0^=LI|-7<%%uzE$e^5@s! 
z03xknmTeV0f|*uU;@#jVD=!G5+1oY5u&kZQN@t>~(H1_~s>oOUjWn?T{?Imfhcyl+ zrdyc+)L#PYDkvN|R5>JO=uZ8L5@t2H3s}Kiz+(}|O??wMn*LW3@qkW{j zYHH{2lb+Z7YT~K30YP4RcO#~BeVy(`aDxjfyHN)JKw@n<`~j6o1qw zx!@r}f78y7WB6Qc5aF<OKk<_k*bMfC%+rYWn<2Sik>#3qNe!1k`}wpPo+>TBLt@(Pw9@L!b`4 zqOKK?D)#~utXT?>eRXUzRRer!>9yQ@{948;Y0NIyD8>U#c*JLRz?{Yiw zl{s~aIeYJd!*o{uKx!}{WKovE%z@Kqnv?wIs3z{qWOiDpL)}S`KRaUO*S4i9n_z@L ze22hcMqf*aaPgXW8hE-mU~ODtBt&2aN`h+^WsunVMA{kBPhPcECzq2;lVXEv0xvU^ z{ooI*)vl9R7p4&%-c`2sbVz*`xQjAKUFft<8QT~eb^ClPki3kBf&Xka8&-|wIxq+Q z^Hu!1vbL?L$jMS?gcvnAWuSFGeE_x8*8@Lryo+^sWEs~ z5B+B))?q_3h)D#R85E}ApPgEGzqPd4sGOZeo>+j~ti{E}H?Odp-0y?#hIqx{`*7Wt zxq?A>Sn$dW#SUCk^Sf~!aeQS9FGK&+YyOwE2J?*cQJS_oW6r&mu$ZUpgK2<*2<`1! zh`Z-B4}APhS8~B@V{y#Ku=zbI3n`GNR4D-!{#5)q-*CMrH#mFs=k&kOA02m3SD-9fV z*`D{~FL0h81ntlC9VlE?07`q#`D`6&c`XPREnbh#*SFjrRP}Y{;)~=We*laFA9D$D z6r?YGe{J9N*tAq+O_2qPOcu3mZ~I4f%Y%m2VXZof0_Rx zPL`vr$@I%d@NzSfv3Wzgz5*_Rdwi7@G^#-_zgr>?b{_mTbLzmZ=L0N}ndYO7(~<9a z4Sh@DwtEU(M{bwKK1WY^i5+;qBL{<8td4^Z`+eCTlLU40wD&6Um%3Oxjwy;hd|k%q zG4ZrU*EkeDS{KXqZmougkxFqQw#jRE^MP8p0exHm^#z~CvhSjCP78}nZo__GII;uD z;JKww?=Kgzt!2-l!T*KcaH+ zLfI+*>Aq)k$)0oeagoNksVo`aPr`}WzwSEo`9gM-W(KQ?>ig9JENAwY`(AgW z?;Mx@$8<%vjbf;HLl9U(Yuj*mAk(@V+_@&c>zyg5`De_C)sTq&2702=WCNX|L9fN; z5FIC=N*|Zru0OB=C7oZ`W(k8*MEidt`^Ei$2^7)|1{6G~irwpYAPa8alzUFqayNEN zfPQk-2(F()Z-i_I`yn^wj&3BM<5<(e^-B2i8Gb%NFZK^AR8&tU>NS3brCXQCL;+RtEC7)$8CkhFGs&oJ3eR5`)~srTYq zUmcV6*-QS#Op!kd4Le&uWql!=ir}_jYai+chk?BRcraJqBEHx_c8BzrOw+b3=XuE05+Ey=!h9{rEOic4+?-UN25 zKVOf;K(E4S-)|*f^dwKSe_im64D}2J8;SGGg0!~oFpvlJBx+${7HTr3RzI)eL*ER8 zIooQa8Lzd5FbHBU$7tBZ^dP5RvPLbiK$}Igu;k8rM2hM&q^9}XL-fB%<@eWA`5+PD zHJpGeNRWKvdbVoa^1hyc1fN_@n41UEtd!T)?wo&ogcmT@TR*!#p;Q4+z*Lpze?b&4 zO?URPxuU26ZQ@bh$rgI10P&pwcT^(X>DcKQa3zl;{p~k}Z{^_EW)xozEVj!r>xhcJ zq98GEK|$5v4V{1RT~0cAe!SID{QdL^N&Tg*_8#w7k^Uhyka93FC!YkN`kbjlfQ6iH zdBZkYaj*WnvzQI=pwE|P+gm4yihcHQee&9!Yj?~CHzo=pPq^srkjl$ZX5jFVb&koq zzM%t`f+` zE#^dkuZ1fiLtO7*%DUXIE!8&tpYxYRMdfQFf&V4I+kNq3RIV+AIe08h99pm`eQ#$M 
zu^w_#jQ{JP(|GW)dq@7{>iNO0nh9E%UqOiBd`5yrkDzr3B=R}0lt3sTJJ zsiUS=%-6P*8mhjQwZuz$5|op~g=({?-}5!_)}0QPM`3cA%+^I&ua22KSJ!%~6a!}7 z9<;znWCl#NrohHnWoBI!9RyqLG(jO37Hx8{Tq8b%IPTN8ektk0dX;?QU&tqCV_BN$ z69zcUWN~6f#g-n_PrY+zttqm0Wy4TY$J8d9d}_!ga)!iovEbt1ZtcV%QuU?CAzAEH4*Lrg z8Aw)bL2iz%Z2dOb-tLHjZB;*wb@|df{p!A@NE@{_&1-%d?IUB(H4_VefP_PRrh|q} zy`6K+yYy2M;eLm(CM}u65jRNp$%C`E0uB9GchvNsVV#)2;NLpjfvchdtstK}&ATI` z39rB=xM*#)=gQ7d1l&6vXH*GG7dcge{{3D4?|uz(9ssRmf(8C82@xH}jH|$Mt1lJ1 zYyQa-`5P;t)P-;HKWZxV$KRpw1hb~Tly}kM0(~|-Ue1#p?VL0#ycRu_T_9VzWXb41Vfm9 z8C<9r?CXu((CGrXgizTJaF?C+^1nnQ*@c)x4+>E2P_4u8*Q!;hZw*9_eB04NB$}Fr zATcU`3piEHiDCl!B7Oe?K>b&YDuNccsR-2^CyDVyMx>u4G2O;eqB$2a{DwSacQLmHWBeJ`L#@P%2}>h1(Bip@SIE zhHx>2Avm184QhwB;!@wGp2D+C*CPp&#Cmof}H-X$e zz{5!vOXR?CB?lDqF3$}RX`RsOxLkwU8sZNg59%b9B=JuGVvq7N+P^)7wxDW{Gymb^ z>eoe%a4UjVE)W(7NcP4r=dd?JS0f1~ve4%&cH zP;vG_uVMxK<%KO!e_2p4dp)=8sj4{%y|2s|^H64p)I!jh`fv!T745+^_tF(`S|CG) z6C|wNq{}j0o%jq)F#v7 z1~Kg`(3KHEQfCe2&VKxZ^Nk{CCsFf1`~1xT zs)8JKI7=Fs4!~BAlvwfrMnKi_CaRwlhhvxk2>-y)*QxmN@Jk1ZAnl6?(ECw)y>so0 z=F>vto*d!Nz|<$UXRj=E^y^2(OgXR%>dgI0qe{-U#u&(0+OX(PqfzaUl}sS5-&q^@ z&pS7V3in}G#`!n!l;32DAdSR5y()|KV}$foo616fC(ri}fGNy~Ey+o}Ewf3np(Onk zqX~`r!FHJ9tdJyqo!zqybfN?95Z39BKWlPS9j{n!)mOsdbzDOm6G{3+6L;b2ZFa6m zA3@i0P<08%Dz;`}o*JWo%zPLrWV9K~_4wO29n^=_%GotEpvO2z>E9tq`7Gl0()#Pw zEbJ;af6^ykfbX5@%Z_7;4!s0rriSA5R8uz0N44)Xe|DFK7h@hBP*p$JEWN!z6WCE^ zYv9@s66WyxAIdoP-T`!QZhaqH;z1S^BuapRd-!JDV}ujC`T&gI2c}d#@An}Lnal>8 z<)FPP`x|<+Q_oC+Fvm*nE8FMDU}@*u>X|&zc+ps{7{KixpXL8QFNg z_`cP^IT}pJp=!-_F^PGmo)I)3QFPl)i99$1iQ#56?e?4@pJ1O3YdzXkY@(=fH@pGBnKJ-{5g<>jtU0!roWoMp2c=EC zCEL$|RkI-ySYITAOo3tmCjBD1uyBO!!{rrwbN~2Ro7{(+*u`qnupq1Rs3Dr?gw?xD7iN6Ab%Q31oFk#@ za>TqO_p!J-f{D)t|Kl&g1nLbyySB`)gz7F6mbaTwyRh%G8xF+A9#N8F-`qF-E-u?BxE?Y~S3 zzi@^d3D1ImbVJN9`b(l*>Z0Ln5i>)kh?8==+X3&~FG(4xF5H$p>r8l)Alf&^=<@lN z%zF+KqmEWsS8W7ic`4LTRsp33Zy%VchMCB(qDxCuB@?c+7?$T2!0Fj2RvOPqd<(zi zZ{u-gXQ|jfKq!m{DXTb!gXvN6IaW>|PZ2!>NF_?Ok@55~7N?~6Ms~#H`PYD=mAQe0 
zAK7pLy#Och4N*76-Yne`vs-XmW`-#B3G+sx^s+iTxN<3{K5B+a%!yip#c}%Ct3tqo z_~M8+nNxTa*w(hLMd;Hq{t0plot+W8>J%V@9&*5ezX0b?d2f`IE{emJn9B1Cm-Ow1y#2HC{!Ju>~SrzsqC~F7|2`) zaPrghzP~(e$}Zoep}1w@aiaF&Bhj73j=ztJLOG`Q@U|C+hY=7aY@qbUtMcL0` zAuDr((_N1U0Kp7Y!BRA!ZCbkQHuyUWfZ#-_4uq%86=of5#TuT6`C>GY3_eBR9s6F| z^fk=+!U7P-O4czCFq8!lK+M0#ne*Q@u>bDQa`w?y?~U*3PH7*6w5NUVEdjq!RVh=4 zog+@R&1D;4sbC86ZV?+NTdvZC@Zj(mqGPoyJT(Qdbu`Lx7T9uu#d#a}u9f#EY`=M^ zR2`5)0=>zW4 z_=}YWNP!-gCqFup-x~u9jykk&hXTD{1{zWFS>)>{wwZHPOr(fAPmyhP8f+U> zGDk>q=mt9eS!=^0G2~y1ZvFEw>&l zVNLLT|D9m#LlpLnBz1uY<_^@xhz*pNh5%**7dc5+AKJ&B0G+*DqcvL^a2x7 z4bYY!fxPNHsSq@@2jC)*mlPv9890YjzEO|47ks1L7fveSEDTJ7JV0Kj?59|BuU_N( zb*Ja;;8Wu{iK9DkEh43uP8l3tG%*Wl2CXC%Ck+FcJk0tR(uQvLU#k^5H-!bRIu|O| zG~M9RBlWUp>SE6Mb_3wntJKlo#$N~~NUAz}>W6RvAzQ^SY+YVa&f@KL` zoj*Df<4lt$2g^ufp)rjfUpV$BqbC}Fto&YwcHRmuyIHf#gn`oZcteI~QB~fqG)@1I zXl@;K_ST%)ZV2QwLpOcd8eK>xKPWT25(51G@DA)i`U-Oo;=4_q@us-j>mOTFL-hZ+ zqBrfsHTSEA-23yEIH^MbZX z#?$^8yM{WdI;<&xGRy3ki}#OHYJ3Gg`X*6+N>Wz+;kQRrN}%a|Ilc)9!whuxO6ze^ zLgvu%7Wk_54k`zu1Uku$nHP1APCytn#EiEZ?fUCa23^3Gze_JLV2t;YDFk4k7zEUx z1XHl83IP%}%FYs_eE!{~@qhi9sz2<@=9J-!l#^lUBt_YwpCDmiKXeyFLpSUDE`8=% zkTNk{qe4|eb9*T2;^Hsw3z)oN8Wo-#P)$m%&0Ra7~THDNg{c7j<-#<}R)(SRYZR2#&nUT= zcXCx_$fF!Rt%!Ido(eqq)rVf{*QVpfxb!dc^=;_PEs-p@@$}IeHooPmDhls}@np{e z&zN-rsJCZpPHeUK!X#`T6E)>JM!_*LBrVOilh;|{1t!kmmxi71tzApNA1nztO{lJF z9J4gM25LqfjNhS;R_>X1)8OSx89gW2z2`H$gG}w2N64wF0m8p9Yw;as;8HY6YiVgI z)oSazzf0Y%hxEzNgoRE{p8&;;G0-!&LuG`frpL7phOUomg9tE$$g4uaD!u*l_T}d> zDHxBV$KriO@u9fdS{C15n~6Iq82(Y*IIy30H9<6!tTx7v zgfg`~%wN$<4HIG8vj9?n?=k)(K>5$}v*r{Qe@|D(xy#kbQ*xsuYb!t{>Si0rX!Rg8 zU?v{tYyuYZ6?k|mQV<8|LM~y#dGL(KTvZ}78g=YC;22tg#LMpmfB#RIX?`ny0x+i} z(~6nblr&L~N^%L-=PZ?#-v;EwpQQ-U3L<0<&Ae!pa`cmx8c|QhAKZ^u5~ftu$jpRr znTCNvm*!P?9(6uFOvOan*>!-msDGpLMGpIk9{8pc_9{Qmep*eQTg;ed(6{7VcB)aS zHlSPkHRoCzERocznmI3ZGBMqCBp?#^#1i1jxz+1KcWJH#Nf^dIo~)rz&ay(_J{4$Q z9#5@m>vX{in<*`@4CFtI8eNuH3>aOVPe}q1W;&BE?ZU=6wAQ1U+>$jHLH?G>lSKRc>5NDpX-f=@hc9R3 
zAZWk9mNJxbM(r1b1DEGFz^1RHVclEWC6D7tul^P^@Nat1W~bq43~V4j+@KWgk3xFI z#1y;N`!ZLbrg97P+;mj7Q*NC_k|DY(njw@h)Vbm=76trP9jI3`@o?|<8v!8UOu~wo zO?2H;7p%=++TXB}&NJhsb~lzVlncoMR7OE%rHSjAKL$I!Yn&vvUZi^{jc;_#2h*%{ z9QBkk{MedrdpVl3A&8b21~$7a*kMp2z4?U(?VEKiu%<^|(g&4y(v#s){zk9o^aLAF z>@Ye1*gx{MY~9tqRNkBX3rXfxo7-Ddg>gIaY8iTXD|3qBwL>8D-N)Xk2q+O(SGP?K z3YOL$Sb%4f;0AGr+4-&T0yNaP8rW$WaZmGGO13f@*2YujNjWsNRaM}-tQ)Mp3CKcM zd0ANo9KY98X(*`H-h*wq7xF{qR7RM*zem<)V=j2iLWO4#Ai{XMf#<0jy#-FR02lI+wc)cu z>v^U!fz;PK+-c(1VZ`3c&-NI&vawZ+H*NMZ=$dk-6wcKMNJodRt!>aJ2k!iKK!|k4 z7S-@t@Z(n?YczpDK*^)Jk8|?D37e}GzG_q1JnvG2{kgpMQC$8h%orxQazkU~n6)wh zyUQ#;>sG}k@Bv<>S5HQLxdUnwv4RY!Ty@SSuhi~_4@EaKM=>@aHP`Es>g1pr65m>W z#%idYz`EWT66#4Kd=6gag&x_cVR)pBw-W;ebpLx)Js7&YWr-cuLp1|S2TDF+s}9>1 zBo2Ctw~k8+`aVlu=uT9?J{uYbt)*EXg%);-!}3$?h1>p<2y@Vm$su$svYxD>KQOh} z6XmYdn)ETE#j|a*&!{=()TIYMEkf&+yi9%k;dxnH+!jh`%KH0eAUW#ae$V_DzBU@1A7iB?zNxtmjf@| zFW0U*CG%CaXiC1ZRL)4ATNShVe4ZYk{?cP*hJlQoV{fx*?mEC$r=wG6X&pH!&owy7 zk4u{E2WHVlQ4OzJ%(}VVvDLD9^QE`#6n%0bzJ|V5YA2@$?kovEdsKSa#}_y=Kd|zA zhAA$5NdP>~7bk0E^G9GBSJ#aw-o18rKY5+mG{3d=XuTNpbOUa(JQxmryvkBQ;0vky z0u$LOt$5lRwnY1@43GZS3+WM#57oki-wnIIQ3oOkn83=Q=luP^y;BH>SEbPgB!n)J z)9h036TwVA;7}JnJxpV6Ej;t}Ya!dVw`>`kxXSq2kFRlNj4w;aM>lZ!AyGvQF;MTi zw2}IiX6o5bE8RX`KK)+!8C7E!OEVUk1fTx$jBbdBi()OXc^(=Knw=VrgDe4uhjx*^ zyE0ry!OSge$@Aep0ALb(udRaw4p|Bw#SK!hzq`r)?LoEY>R5$8yyv=bC~ity&qrJ6 zz}nX3`JthK?iz2;m`8H;h*xV;sEB6A&!Ia;>!O|32Y4^Dh?{bc zW8?F5sY#sXP{5cfOF}U_YeP1jE#sH*=4n^0gSEa3$$r8+3?d3?nD)N;Y{43mWC#=a zakdSr@$fd>h7ciaWG7kHfTU>jwac0|TU1}yp*#ne!&rN)t1}0M>AJK^u9i2+OL>M6 ze5p=6s8=pANv4UVq;R`eMuZJx&Tg^zx!uXQQBG+E{?-`zR^#MTRtYVuIk3CM3!3K7 z0MIfU%ddkK;db;gijyr6ISVsel063MHN=O{F5KHJ{vd_>&e+_uS1ISRI%`+6XbWa#|Zc& zPptiD6lmXV{B;9D&j}}Lz#RavXiXJdckuHdQ};yI28@PX-T>9|*66a-U!IqVREZbl zxbX(8@%|o8tXy1NW}FlAdHC9xVVRuYaN2+VFV)v*kh#T&tCzu->;qRdo7*=eOFw-i zm~gpMSoLEx1e}oT8F%&4w7%ue0K25H(c8V9@o!8% zgr8c07{%ZHvW-hC-GPhoVvcv-E`R^{%5V1>C810{AZeDbcWT*6RX+BQMWJ=WeZV(b ziwrnv*lhy^2k)9@hkH$xTjq;yC9f2ZvYHZ$PYu_#!Q`26Iuxz=ffl=^6gf7SJZCOI 
z$NG{Z_Y;VC5<6Th4(%SdL)WyOZS`{Wnvy=ux594Uhac*c%)zfK>!tXQ&-^gSD@gkI z&ukEk4h4I?gH~xp33_XMhJw=&; zw6D_g_yHR_K&YeaaXhik!|Imp5=3wa$# zuFj>_lD}wHFlV6oIn*&_17fG19pRsWMvYxK&Ud=By?>y(a^ow@MRV#cb z*!Rkg_r4&y_e(??7FohoNCC^Ix-tmwlWLguK8ohIl}t9duw~P3s(ybf-A(H2UeZHO zFuV}~0N*kP$-ZF-$i=$hikvPO6)q6hb03W&Kwic$lXN!on-tTpu}vO=t$?iML2kLQ z`;Tviy5w8;l0KP&F1yb8dI_VJsF}RX&6)}KD`8C5Xw9i@kQk?D2+cW43WnwsX=$U} z3%+Umsz{lBX#TWU3O~sJbhR*REzJvv&#iI9Za$I>i_R+jX+!L=SZ;)Potur=_ z-$>7U`4a(L$cuL<&v#a89!%x~Wy0#Gi;SJ#H1V24H_B`VtUz8`cYLY&=RYJ&QXr!q zj^~{8F+CKp=@CI(9owK^TKzh{lbIVMf0$jPP`^6y%>Eyf=25p~rTb2^S1GGXDc3)T z$B~i@fj59t7D2Q~xzbhExzHG^$$j-#futc0ecj3= zrzj7`!NR8j7oxSz3{oYLOUP0Le-kVsL^n1x6hp_~@*0~o>ZBsx1C#rNnEG-^jdbtg z>vgH+1GL6z4uCnwl4fesa%YBU#c{fO)94~bWgyHytEBXcKRK0yp1zE?sn03$)IadX!TVi;G9Q z;Qb!i!90(VhutMPXRBnB0ciW7hv~J2ki^;yp;izOS;N``b6C%OpG)=R5Mlsr%laLH z8&9u*@}J@^N^ESb%GhJluFetKI)1c8`oG^R@ag5DJ6G$s0Ty#>9}c1}W^+pQj%T=0 z($=r)gRwgovJB# z{Qjbig@vMEYP)2fZwS|x`GEEf5hRp>3&xbvieolIv}P4hCg>`s^WYYlTVC5GMt+V* zx8%e2i1hXuxBPbtYyR(cR9yQRodJls$o)DuEeRPV*08_a{(kj6(zse6TFCiT?BmkQ zt5;#P(E%QW(H2Z-pd0hZ#H3%&D3>%y9mnq&<`z@{su}H=s|v*{SNF{9PE4k zuU9#I6?2Oc$J6vN?)Msx9bwdekaGn1VXPZlKb&^_*Z&5^X`k)FR?%@vN1jnFAxBs} zoWNk6Z~uehuI`^49G~aQsYj9hXniUKW)NU&c$q5GBA922A$9FD>eHYoa^Zx|bL>!} z(CeCnIIkS1YTM+aez@P40z!0GApnQ1epiOmd~jU$J)uNc@P^E>xnR(JO2+;P5?Wy| z^jA$#vMAnF-?vfYc5;WF?jQ!A4QRfm<`DJtU--Ry|B3KMB_f1ZGE1RibyVi z-|X}W!)Ne~6CS^U%TDMADTKZPTtVG|=>gjxkg@DXQG*xU^Y-*1_BB%Y)LsCz_nF?n z>sCWo7bNsxVFxm7?{(Ay1yU8iKI<0$CEAL z2|%DPvD&ljUyYt40}(D*n0?v)Muu%3OK&YjA`#gG8fKdl!dBn)j6yv!p~;C&F8aSP z;cb*c#t0^X6&Q|)Fo(?gQ}NmjeSqGif5UUJS;QJ>jti@64`A+s*n$C23E$DmQD+^` z4$-o$T6i#j*uDx`CtWk8JPhESE%J;($Tt!SR|f#_E!TfUN`86m^=jq_Lruy~fCGf? 
z9A#ge_HCM@5uw#Yi|#U0ATqTg#XmmA9ckJ8P}fC1$R0ZeLywXN-gs`&JC$O9V4Z*@ z*9Sz0?+!=!UxT-dE^)jREy+W|D$oaZ`fM)9Nz9gX8a>b)ezbij=VvLW-DK_dbgj`k ze5t+wL9@bUZTR7eT9w;K0e+{l3c-hi3kzKg!6f8ihV#8nN4@-- zitlYn;0fd()d&X&*W`7xj2_7ifL^Iv7Io#v*Z%f^8gZ9K!yMQL3ni>PXxdqy1kY#j zd`G)Qe5XRuin10SAZ1tqW@B-X3uL?=QnS97F_D9!FQaBo*C$u~r&=ShD&YC@&1#RY}j^#=fF@x^P9|NCjb2B4qD zfr>c>#jQPn;l?QEdae1g?!F01J&uhht<)KK0di_^2%k6FMxCz=mXfmb1#tbQyw5+F z-u%(ZXg^}h)^IjMEGma%b6apI1=d5yVreIUU5*zp4fTdcHq%x~c76jAvmU=}6=5HV zsB!n^tFQ&Mc{`$U7hU3}mi+2k$8tz=%FcX0i|$>X!BkmLJ^TGd*LKvVPhWv(0~m8t zK5Jt22UE5wn4Zwk&7hY{nD2OwwXmQb65sU0S5;f@f z4aZjP*puw?yG$CVxG%zN5Aufl(URfFIxk$}6hy7}-4Emb$U6y=;LBdgeo-pu@Kh$7u59{H`HHu`0tPou_OZ z=|hzLlsGA;z#^KX9QPQqsI#6S;+}C!x=+6~#_oBzEv#@)_6GISw&0u^cHw?7>4VL( z9Hq-O?BNB7qSfrpN*$-NB>{Amew}zawBZLW{8qBovtx>|0nz&tV<(zWuPC+bG)BTj zB?s83y{g$?cy>x;wb6pSrSzCg>)h829uYM~;S#V|`^*hLI60S;$^onN&de_X8OPD3 zMon4lhsR8k;u>#QPnrVFcZlRhO&Z}GDlg~~6L}{&wklrwE}jn@ISZdiCC)8+!G%xR z-AS;kf(wh*3#gtbAfNI z%K)|Yo@L+d#T1zVZQAfMh>(*2VD|8yfp-a(pvUvsOGR-u=R(r>B2$I*b(MijJC{}m z6gSGo2zxK?kpW&8)nW0&aw8vi^vM-d@~oe!O&2)72%F@n4L@pC7}>bdsEGPmdu#y42s)(U!P-+qqglCv+8P`5VH!XgwkFh%#ZW}E1td>Sq9TP#2(ZFN1 zw4(PPJO646T_0ecw+*1639XU*>II+Jq4Djr`krn{j|Rs9j=BkM#^)cya&Fr&wfhB6 zUNmq^%JF#6qp)KGdhj6|v=|rZ`VO-DlY-9n`R)UvER#{lEeDupP8y8@Wg&+?_NPbY z)B>pzzmSVtA}H`!y^eRmh(ZU#lR~DhG_JMkYrm2}ur1*+MJlE;wuZC+5Od-7`=8Gw z)~OY~ytjLQ%H(VX*wtnM&=fKW{;>r&p#>5!0qxfd(qmf)2-kOeh)-_X&}bGF13m%OLUs zUSM*<6=G$EE=Uecpe(wUK^*@ez|7|27K$<^S{*2$;may*vdE@BU1B;4&BZH%+D_@; z;FijN@NpcyI`7+)CiexFR4Uwm1;}E*<2ui)2=57}c}15^|GDwqX0LTjH;;#kjU6*k zc0)}`za^-=!if59kEOD&)QJNW$U89^8E*@Op-e3VT zo|*TIyzMT*Qict_%oSo)`B;GbMncDTCj+nGgELO%i%ecypbmNT_bBk6+SPYh&^dhF z+P)!4O3wM*d0I=v0B}R~c#(k(_4xE%5*wHZw*wHcVqU<7f?C0q%8@yQIPpHOa3!Tc zQj1tYbM^C`{|D=<$a&TBZxBalM;!oiWL)Oyf_7iXJXpv84)<8D%|`j~w=xBE=Xak= zDgevTy2Ja{Ba#L_?sqmxjzMzVfA8XhGGHot_oKAPjEqvw5VD)<-a<%2yz^XtFSN|x zeEg~GIVT6{O|wt8mw2r?l!XFy`1#j{?=QX^8sMGdj-ytnkXZ$^n!9{8zig0l3gj&` zQ}@-{b-lthU0ewJnJ$VNF+fvV26$aB1bfl9>xZ)m+-Z6aDaw@`(50HSUR+daS^|t5 
zxeH@!KG`wArd?$3bOC`yV8Xy^p7o$toS0)~;ro-YzVFx*J%!ur{6O#<2w5+Wy<;6hU&krc8mEirRN%9YI|vOi#SGMA zI|c)YWHhAg_%6E)Z2EfV!`l_BvefGLuIamdA@>B;c7M<{u~FL2Fg_qA1x-GG%x4eo zK!+JrT>JPZCYVfy*6O9VpAcu|_2wb6h{KFi|0STl%F;F55cC46!SgECuqg%&5*4N9 zmc;;Wea^p*O#$)r`(+}mBxe z0+qp+&e)1U#JwL|+=d!y3VzT>6Td)w^BJx3>BosRVlAj74NCSW_{Z_9{VwwYMRx$I z`R@;jUjz?kPe#O+kGnV-Kd;?rly*G-UJzi47|l#9J`Ao+KI>`VmI#+a^q&MlB1oj7z3U;$@U2nS%P)=Gwo2+KX6B%WkVhbwcZpivWF2wWI-N$@vmKnqx zjP3O3tSgTm(50kco7^}d=}{uN{d45d>OB?Xk$^|9=WW3|(+wD|owm02o+8VFm+wK| zck(9HP!_ie%)6#`&Dxu*kqLqG$EN61TdBvAl~ZV@oYW94r_GnT;5$8jQ-w1CzS*GaIU{Gfif8)By*T*k)nb_zU& zyq`Iu40)`1vTVlyWS}Db1B(>;X-O@_$b06A_KGY3S{JyU8^ zcGSlJVAg(=y@*6XIbN`+*rZW+?Z;^F<-6+8f%M0~@|p*yA<7=&p7L_cd^7WSbusfO z(x3?Fb%gKuHzFet8^+hkJH93J4$PP&L|(3rcr6gvF6w*F-EF zXaM<)a&L=0B129wn1NzpFgTUjC_}+$+|W(OhnbZAwJ1GXwb&9|E?1`v;p!#7ncq5c z`pY-DLfI%kufryWv1mvwDI@~%qIHrv?Q?-nTm(p*kLO9Rds6XUzTQ#IXL0P>AE=Cj zvJpJH^zJz~+2MT`!4hi2bwSSnt^njUMMxiLxU-?7`FP;s_gAr4x{_}JvVe$Lf`i*7 zt_n4Y4Raaz2>2Lupj9xM%Q)pqVxp{$Pf;)*7*uX8m`CB(*FQypy{nSjW&2|L1%L`4 z%SrqJOxliMQnP9rtpH3YDZ0cDCN6;v2cb3x+kf4s*MBIw$_6dPkL#WQ+JVsCe(q*m zpf>0oO$52n|M(-l!9;NU z*aU;O+E^02k_`|PLS)P77BPf^cO@kPyX zOYl7kv{gO>Cp5e*jnyyynQr%^Dp~0O%O#Uv9RgFf^U(VDyYl}t+3sAH5%Th*ryef-7>EqY3B>N~a$fMK0i*jO1rmkh+A_y<3;E&&xVRe0B`2iE=~zaCNjYiF4(fFs z9oPm(d<8yYq9&0(`R<_e+K*rBQ7F7CLjNjcE&A6`zSTVbxV)jFoZ zM}f>!>i#J{GHBlodZQGZRdka)ld%~{D{s;`K%!T%U24HI=qu7R3xG~5`lOEtxy57B zz<+xd(?@I#2&x&RRX-DlUvDyu4TWmPv_Awee(7|=v zprS@<; zEr6sYMFYUo>%;Bih&Qmp#{qvZ)_|qZXbba~(NWO4_p<_mwgNgvt(CYMDQ{QWnI^N2 z*fsun*Ux(bAolkn$~76GG+{Fp13GIG7h1DZn;!vZ16Lef_F~FVD)9R)2=E`06-hFGznlVXws|3qNy;a3&lZ}rZIbit)n2C(6d1j)Md8-4> z8o|hLE81*=yaIW+d-NKO&g{%y0r0gNn3)<_#WU z<6fS$pJS3H=z@-S;z#h31{MjBG{AkaP?KZyET3Fw%fz>D$VEO_B6_~W~i zpB+VLHCnBHzB_Po4!E%%eW_i!cWN#9hK`a`lAj15$XuMe0FxSfanA2BP0ozWl(%N& zIm8Fe4n{37n90eRCga6H9WRmdlt3RRb7t3rd-Ra~dwEYZ8*9$(9;RPNNsa`E;sABL z3{&PLQ^A5<4fjt4dBf|AyM0$XSrzt=tAQB4a}^ zx?&L+=Gc?KctNq~U23Ba=v5C*dyE`mar>0OTlu~uNW0^Qj~C*$*89jHpJ^&vMqhoM 
zmFCv0s~fds1E}`lQYuLv!Ap11C+~^Zo~t~rV8BNp25o(xht>>Gb&qgNX;xyGf%#s# zvN#bLd=D|}0Ze;LBR}jDqo7arE&>0?Bw95TJz3vD(7qE2@XQ;LWM$f_o=@EcdCqI^V`itR7p4*B=Xg1C~J?HE8CmfSNqh+LXaNQve6* zxF5@oi^AAqCjAdI|1@`<;qAd!0l|Liz!otsISY>S5!ep7Xj{1`G7lWxE7KQyyfii1 zS$9_guW;Xk1?hAr&W@KFca=3N>(qE{J>(TB-m<}C17ei{@HeEn_av?uaOc>kD%YR8 z&pCcQqgZCstJ9{_U+$RkH+Z=2%|vNZR{)@;el==yk~5i>0N#XofM{hKEM@a{cCTjB znFV+k)EICp(wLH$r@-0d)L-;1gxQeW>XFvJ5$C=hNE5jYxhxqP%~j6ZsDsk(2TjqT zrgF;kE)f5Wk1(CICiGj3yX@p?KBobWYgct7^MOt1jR%>Sa{>P0PGcap7gHTk5q-3F zXBcbLW4`zR}o4)lmJx=FfI~V^lnIxo-WdU0eDQfUU5OO9%ReqI8>%P%^_yFaEY#yxG%v?N zg%jfmK;**g%;~Gvm7w1LYDYBHPTp5;jk2{o0ptoUbOH-N!d_eX4{-6Q5>Q{|d2TF$ zDoWH7^f(rQlU@_+LxsOWw!kTIPdWMP5$=G}Qrh6ny)u16lOj3_2%wB7^N+Trcz_xR z9XC{txy4zLe)9DFWt0gb5dN2&N$`(X-8c0CxH5x%q#nvByr$ z&Pm^fOv$nd0gV#RkJgW%y6y!XvM$Zh3cuUMJ#71x16CeKpZ&|j@8~hqi5?ZRo+BoA zW#R>`zw%03X;yto2Y|`<$~tVqd+2J(UBV#Wmh|rVn&o!6Q>2#99Y?GVoxthV8h#5X zB?K)+Ex13jxTEBfkXAhk3b%uPZll2hmJ%`%Pm^0f>)t4C&jOr-BEW9O*OE$feebUD znI%9sQo#74>M&@yBqu7@?d{GbK20yw`BvXnL-o=7!t0g}6kJ)pfGE^-ZT14l`#N}zW#n!1{FLgkL~n1!+s*Wy0_2>%LW*y5tb+qI;w zRX;JWKb!$LoIn=>-#USSDz7CC0j&PyH>zKne?;LqIyr`sdV8++3_LAyQr0#!*3cAMkk*iTWLgXd3LQoYJJ zmZ-UoKN0AkI5L@>5TKg+i~_M1h{)40fXtU%DH&uC190q*sX$tFh%}=NL{v`N2Gg_6 z(?!{CY5P$$RW`SpTlz(7!lM?UKg98x4H41eDm;<%Go=tz4O+Da% z-|J4aRn2D$2=d3r4+sm3ZSxTM2cM3ta)0TggI1l+)#)#(9Yljeji)TV5ftR~@c`re z(hCM~xcY##JJOG!&lLs4#$@;%{k6)OqPw$kUceLiab!Die*vP=7uCQ8?ZyGhir1fB z0DWFx0LjFN1d^3fG?pQT5i}X?V%io`M$13@AK6A!ngTVu4H$6g0w5H5Y=&o8s~13E zj-~(2*-vWrk`+i&sLX889`O_KumDw~2u%o{Q8IGD8?+V+sp>X0z!iDOqAR3+%T4>U z_rD<zaU+(e`t+q0we)Dvzz_>VN zy$wu*u>u|5c!@4eLmhx>UmN1Ir2vGy^k$o6SDwm#4If))J&_DI!o52oBM1l*Vonulb5BK@1$`SQHg zKPkjgIS1kngLcLcnhZ#Z{=}!8{oB+bx&|p2%~`II(PjWlU6&Mc{vY9db8jKx$Plx9 z`QDMEr_Xwv4QrRU$Gv~{bSO(I#}#&TpTK$!dmo|n%DNvzzJ4|BD`m~+MzgP;eK?eQ z_};VA24IS>f$XU*qM>)B*!EIlkwm3Mo7nPJ>ol=OLPCNh(bn1;&f4&wPKO6255h1SfX8n(jqY;oPHWmb=JU$?zE&+>|$l#KW6h%eD zLMnK4(YLP7ivq7j* zkoQ)C&<5yeo&uN`-K`!G4Th1Q&Foe5VpD%QQ_rEsCk${ 
z3ZvcdYYNFA4;;cBkw$|crJsLZumx17f%UZE2blOCv_E}&?$7=IFTUvSF9`JmYqZa6 zFv;C-6EF*yrV0kV2cK<%_QyqzqjVpyG`~zmYm#;^Hy(S`|F$k`$Dz5fGaO% zzCrWP*68ma`|T^ij~$;KP46PuUiks?n$G62LGCPrw9> z`-Xpf<)_J4*pjC^Dn}_x`}3&%-!Y$!%Fz=?xss(1z&C4|(B(q_XPdMI?kput zMy~EQb{gnspGRu$Iu7~_s3kUs4~}@#Yp^{R?tSnDX#Y!VFUMKvmQ;aUtsD@og`qEU zP9MFhp&(6?F#GIB^PDsM`0uBTxY9nFG?8{zU`K(P-`OG#zV~#7xyVTY;ciQ`E-y{m z36URHf+~T796d_#>rZOU0YGU-YNL6(*+M$}OwJ8AK+9ixYv*N3f6JS^*2sVo0V*Bt zW#DyMTKMx~n`~M4gX&26EZ^LBUyH)|4E0=2->v6>4qxf>-{;&W{OIF`XXyGOI4W#` zc*wSnL4N^BlP0<0*>=qG5-;C{ezWlVhfym)RJ4Zj(-K|x4BdwI@Avq;F(rxa4x#!7(Y3$*S6gvJF(@Z+|3re#Q2xzI@2Z@8FF@PWS>QQ_{b%R(X9~e;V@e;nnA6%AV)`oXF$3KnOEFvsS&T- zt3ADb#%Q4U{YUU9rLrz!yRL~*Q9#oO>3L!L#b;NrwLK@u-VlGe)hR-3Q%ZF^-B49Ju?0-&(2a0g@E zd~S7K(YyKl{EELFkrY|*s`<>$-f36!F6R8cZabThRe+sb6E>HqIj;&W-j*Bdq?4Pv4XjN0B&S4B_}oFzHC$RH`PtqkQx* z1pf$A~iW#FOR8ATZfL+e8MvD74TcgDC-_B0jk zdQ!8N?^@l|fR3S$=w1fNhBf_i5O4nZkH#_dkpN(4GLFJ?dbS#V1EKFm0bSNKfQ=Tv z-e^*S)YFG&n`IM%8mC2TR^L7lDaI!+9cf5kIs@wnA6f$a&Cqz|n3`O=u`I!+M(2Ky zaF$=sx!VP&Z{Nw%_XzT+XcDMAPJ7|d2qCicZm?0sP9JVpm_u)m=1bZhxk@!SG)i#o z9^La1_D_@%LyE4h%@Ic&)t4ZjRy*HoP5`Q(#`C6IWpSRSe2FuWA*ds{F`H>J_uQvA z`%w0P5_<_!xAw5j`slhNyU@aAjj|cRNRtk{by_vWS=1CRq8IF=8!v$Z*^Ks2? 
z)Z|`9Hu1vfF;z_{E-RVzcqcHXx1zv%(Mr>OeVGZj8Q{(<_7d9$-2}Z)XZX%I6D7H1 zGfmZ8ElA-Mko4?p(sdaQ0RziZwQ-HZpeeAF8m@*>Spr3g%O8VYL;$t8o?J{yBJ29r zbO|@SjxPzY>K%FS(dZGiiy7M6UEcc^MV+GcgLMwVk(85HVxW_Y!l-7c><9IG3bpOR zfxnty)l|VlkCJuE-oZHJN&cQPfwLr8UnA^-x%XBP5H%{o`H*aKjBeBi)4e-vnUjaE zJ4A8oWxw5N10tC%;1#=oYUL^RRAs*1$Ipjbo23S2U(}M)Vrt$Hj0sH~-is-9oZcrg zBwhOvQQS~(&J7Sc%gnr*a+W*{%0DQ^6h3eU31B;+!g9+ z`bYhc@>V41+lk@tWXviEtG#&Djhc;WsKKj}oi&EE^L9loA|3i~fJc$zC=jFF?BarE z!i&BAhd``a>K3gT1aT2vU{(Ao&vOL?PeI#DP+sYYq@J?pv-a*#Q+iL;2W)oDA zIpZT;Q4$46Sk8Ixk6jRLO10k03ZY6(v=b1CaK_JHN#IdgJ;0U{7BZzfZ>0!bWbG~h zU1^soXBYv!hQJ%R(=ToaQj!UivhKUQ&bxaEG9}N>+e%0PdB&ve&RZ4WH@L^N+DW9U zf8W%+zn^nI)TfWB>EaiscP1e0Em3FCfTgrrYwihVZir?q_tuxkZExXM1J^3wYFa7| zlv(3_fN{;cn(n@)Fd)Ono-Hq_BCcS|#2J}oxPNFXH4J)u* zYp_N;O5GG0v@WVIun6D_j^_1X%{t-HYAMa3q7b`m8GPObhu)gr9^KoW>CfBdDQ~&= zDjp`)#6kpd5kkI9>>}@0QA~8C<=i5f;X-|#=Odjij)Jm(*X$!N2edKwiMw(N85xYy zQ!^YM0-^UrN6^SAXW|;Kp^k~Ta3XU1Qgu}iCAClyYA<|B67=~~+;dSJJEJrn$8x8q zKn-?j3fJpmqb_7tf0~Nx>>pGKtPjTLm_`tzc%~Z=_}omRZPlP=j;7u&Xy1;pAiE^~(bWsu|m+kV?Za01!F0s6uTeQmSgG*G)pt3#e{YV^592r=ks=X#~ zYmdZ@tfh0m17?pWHPnIxbatSr-VFL|`Y!6yTV`vOgeLf|13lcaTCcUV3{_G`p?VtG zq-A*R>{>+Yu{q?6@UOT7t^}#OIjVX#}ky^dpIVn@W%QxtSTY;=#s)j~1fgG_-CH zW*u};rTe%puO8oF#S*~zCGB!rl;RZj9z+yadSuA6McAkzNf;%l1<7y3)w!<=TA2xl zws_T{z<_zh#fZoHH#6krAU>((dbG;p+0cEYaWy(#vTTd1R;LCl7fy>fu@(e}`l`0Z zLX2kv(k2^*fUIr^u;3&<>697w-DtuVM`sUFXY9=elosToqq>zw8dWF>3yzS^O{83- z_6Rcn0u$t5gVM<`cwjmn0k55{5vAcQrjFIHeU-ocfbNN)@a}7MFZ2f+pS%3ljqy0f zkJ#w+DEXSM+E}v(&^HFieY_Nnl_%37hmMrep9KLxGPcQrrP;E!iL`gcA5$iOh4YetF%)5PVnYf2KJ3rE7CWCV3T4^1bSCDzIRSkaF*!iXKd}6P@(>@eg2l>*AjncS zMx9crOamU899(rOF)JI!_i*#lk@5G~G#U?;Af6Khp> z%1}o_P^#*u9!v;=j=msi_e>5+-@f`@xKU`r_WTV2*??JrV&&gQ^~&>M$q zChF2oKAz7wZi2E>(3-}s0Q?(}uebC*YTEx)HSBM5`I z)9!^>-B$R{YD$p1!v!(AP%Be(irX}WqLH5H6EoE&;lr4hF&@|mi`lM^qO&s18C7I# zlVKbi&$x-3Dx31cwe&gNXT@#`Y*n@_!DZAUN1OoORXVJN4winnPg-W|(*< zI@06^${BO`Dle;p2b)?ks|Pi7kT&(@?Kt}m?<`M&HoxROLR)jdeBEo@zO*Py 
z0AU1uyUZV}wB)m1lvFIVCm^Er6xG$2jalbNgQRUlFi>2!cK0^+c9Y_+Ka!=tnqyZ} z?nq-<*gpq#1OXWET({9BR&45Yt1U|h4UKe18?wVq%$3CMZ%lw>@&tgLx3_v;^M7Lq zWoo)j%i$68SdEZ=LxqxBG55subzO&w@F>r#NhQW49lM6gIf(YAmaWqEKqsN2Fs5q8 z<3hQf$0)<%2AYzLk#!?G^NpXXw&Ro&5uh^CJIYcmGEqcTF-RT@Vtbe#PLhnYzPG;> z<+&Ettgn%c8SNVA+XIz^3pGVfS?JzC4nD2diV)k5tqfXc7|S9wg(QKjLS*~9 z-bUs4%leopORaR&+4#gfs&j<)Xml4Ny~D+K_ca^x14a}hO-$pbBw|hbn=ZAatf_Xy zD^67BZ!^ZBN0-ba$pqzh$= z(8cehF%u^R9T?gBkUK;W(7JEtHA_xd2pT7PT~UDx7jsCmz5c;vsVd z;q`*4$=+6W?p9RCgbZ0XVFaCR^$p^Rih1|iy;{(bi44vVZbU-{D!N$j@jM^-iB9js zmf4Bxj+oj(h<<>PYxWX~?pF*b+4g|H!JUSOCZOspEanKH4$lh31e=WCd#wZ=X}ZOK zaf?pToOCy6UjuX`HXGm z*?CW`b|r~TSWdf#c=YGhNx(Eh$Nj`0;`HvJrn41`nRORT7kpnw}pR-i%Zc5*wurXkOpmD5o#M*7F8jgxpB8`Cz_lfNRbD9F7me&KG;>xm zWJZ+h5ztPENpYC%Jsi{QkqD;Ro3OU3=?@fJLXN!>e0HFBzd^j3!2#mDmORdiR}@kZ zvke>&SgTWe`X(yuu=tlo5foblgV@xf>swryeGfdy9|=++g3_ROe{Y~}F%+s^TphGp zbkALB#C^#%?N~(7JMRV4jxuQ470BCuy_f+X-A4`s1Ev7C4jtVEZ;En(bWaIV3tKc@ zVA9=RrDW*a*dQ1qm6P(!cdv=p!Y3BJtC6TH`CX?EB!~fCQ+U|=Zr9D>#Ca|0u99?j zqs*!osEoY9wJ0a82VIVaYRVH06@f#i5`Mx0s=99^D)rxPMRC~hK5P+9)36JAteG0L z8&Dq{lyAA)ng6%IGml-Wzxj?&yUi>HVDIwR6f!8JZ6fmOj{9Z}v zJll7kfsA+73kOB==9L~PvBHtYvOysUPGlR~AV*B%Yym^23GGdbpkSAg6N(V7{&->7 zhD%RDj8}Fu)O%K?1|_bSO_zOmi;_HR0ux5suUyxsmU zd$mfWKJ;7ulHy7!ErXSaaGgZH+999q#YKUN#jSby%<^0&G8N|HQ(lXB8gsR0s zW+=eV&-UZ)&kPbnTsE0pkxekL2g+*YZu;u?tIK)I07&f^&oLQ~?Rxw%p7HafQw!rl z>ROZ$mYDZoi%|vr6|%SXm^Bk&qDB!Ht0y(5n?lX3=7=IqjdRY8Pk zS4`MDd5wPa+1t?b-b@3&)Zr6VA2XR^lvsG=pimq!VfkxagWziuuX?3p&H8Fu(e&a7 zUFB9j)w^7fJ%a`n<*?2>^4rz-WL~WBbtS*|+4#;0QH^#8oBP0&u}%jYFpzss@}*{@B4t-bbGM^H;z7)>zg+R3wJS zvtx1H08_2aS1uP2f5)fQW;R4U-`^(mX||y^G!xnwCf>X3%<1DU9^p#5RMYbyj>GUW zrKV%~E=0HiuY28EFLCd45Cp&;nBJBh=gSa@*zx;DTl>`e*R$HvtaHTYIQV^ym2iu~ zhn+g9`aAno>SHv;x$n4_=NS$Z+ukDU#Ta@&)YJ8jYshG9FH#{Y!L-7a~2`x0^`5OTkEBs1eaC1aB(anDxyS2mv(w?x!nF*JsI23$J>1N<#Wx{Jjb zmzV@P=jH=vc|{zNjc0)B>{ZF-!dr{C8pYqDt{+b6qGKug$SD?8{6fe=@1_&K&2uI5 zp~X6+fk!~YfpGfAYtCtm{?=^uo_EzdC90!^?_k4aT+P*p 
z@H7rsQ3Y^N5G1OZv~O}CYEr|sP>dNx-`dcW4E`(4Nc$V_id-Q1d`nN*1|~a0cA4r1 z*0l%UUNvTMS4xL?nqWtR*X53?d!Na(cx=MePk=vw7>D|7L_G7tU*UysWZF%Q*5=G? z2*s?!e+)?3Azoh2`3b|0*4YDWJ19FRijAtEpU94@la)Fb-q>!RdRzTWqVAg{`9>8H z$TG!5uXRmOz4rugPb`}edkD=~(~{TQh%IN|2*sNYX3rUU=ZdAUnh$=e@09Iih$u26 zOHL!)BE3VheB8IIX}5CyEm)>n#y2sVIwrCBxQ*D{*Zp+PHg|HRTOHUojm@dfqw{O|TtS2}$LQeEHr6^ag+kxh#GnL`vl>IKljK7~xHL$mfAmGvceIMp= zmUQn$f4&N;`A5f&>e%iB3!GtaWJl{}`U|R6TB_~5zHpl!A*;l)MZxg$Md(|omQE4r z6;-d_KnS-hESmEud2FJPZ-{Sk{-0-SvW^A;{%J=3PORq^J}M=uaF=cY>*gFVd&DI+ zd_0U_AUxoFn66^cy>RF1pcTG+-O1ruN$J(s>R)x+(Q1)hC9Ew*ofRln*YNuy zuS-~oxWmMPIN#Z{oEzWt&#rl&zr0vhXO3?;meFqq1?%#pDt1`o1xCr)XksJ8T zina4QuEA_3y(w98bx#?JR9g|`V^=o{H!2Zfl!_cfA`m1yy)wQ>Gn@&Hy=CTV&W10WSB-uA_BINL~_##-}Yr~W;wEyj&xBOB48Tv03it%32Id7_qcZqHds{O zAgI`@$A?%7#5gy{jYQ3BxM!J45|jFnjy|K}$|EJEye9QKoUkbI>4pZ3VS+wqU*2T8 zu2hf_s4mm-FnuY*i17Hno-HG#33g6#?IG`l!te2RV&BHM+m5$Pi~zh(F)G!}R&Qm& zHT~^Gq=-A8K_X!zA{;ep=R>@damr07YUMlXkGyggn~20u zo(Lyhw|Og&ajLqRT-e%UVE}E9Tme-a+OOiuls!V6}F$O>bOl64cd8@PrJR+_ri|1TyxP+=}Y=%?4H7QT8x3Jy*VJ#tyT$%H~sx zyY5YfBWF4Dl<#B(5eN<)Di^D{KQ7|dgp1-~=h}O1Bcp0SB0dUfqy_}|XU zHmU#=Rw21aggd-ktCV8Jd`shlWuixIb{e}{+ixM<3}WQBjc2j)=o*KFT$QrC4VZ@Z6Xt2fhy~-gn(iM;%JXoo_ z&LNcW6_6JgX@cKPStDEbHt!TPr&qKp~q`t5L($24i#a~BRG$BlsNTENE)ySs7@zHP= zRfIM74P-pkR^#dN)ml62yM6W+%w(Xv%y4frn1r-1stu}F(PLp7`y+46lxC9zAnxS* z^s;uJzOvN9?~HNb0T!?a+33-5MDc_qDgz#crsQM)5OP(Hkn&&6QyUr(?ifNEV@px4 z*CxKC3y$A}VH4$-Hh3mg4L?K&9O}`<H@GdSVsPAWJ+YMp5h1#>5Vl%vmA!^9kF>9Vk|KsMo$I>SLg(yTZqwRDzG6Ll?&i0kF#> zace$X7l0GFx!2*lvde?HRmc9v8Fy5t2&x^WxmtR%t4|r(>QhFJyiMOmAK6LokD(Xq zBc*mzJ^^!67Wv+&DC!)5t{1z0 zDfA9uwxFw{0(PEYsExpVF(k0;DQ=2U4A+*w9t?qn=5QArW%9M32R*!?Y-a!kD>B}5 zLeq$E6FT9^MEHtbVI5;uOCb{qMpu>f+TrkAlSVnUX;kQZ(ndXlT;d=)>Q%H2=n3>N zHB3QlTIyuc2Jv2{o%1lg{|?hB9~OChok=8;4Y;HB5_8_q`p$d^9Jc}qo9O~{9KsIu zsga)?P-mKhcAnzlcwzBmSa#bR_>b9#B262>6v&DBnv5Wy4I=OO^14rH3AIKGbp=qg z3w1PR)%~=ZTeeekPvBgjhe0h*!fB+vTlh1g-kU#HZ+mG6-XqPjJ?NS=fq0a}#0(F^ 
zM-K_bbw%K3qhM?QXS-=6e$%wE5rz}o*Zkmv`9CC8h=*ZXmQ&eftaH@5T4cW2q%~$tggsCaSuJ(X$ROFQ+W#y>UPpv-5t(|<;dpC-C@egfGeC4BX%*6`uHsx z$i>9+#|R~{Ex^wSRwIoC`+OI6ED?^W17%}S*LPu1L5pzuLJT>LI|#tAA@LTSjeX*j z9t;t|{e#TQuQu`#v)IIA0j4p8K(~5LCO!$Q{?X8t=Zq_LmFw}!$(c@YH5ts-(UG=U z)MIFj<>;rhmncuyml-=Gq&z|Y#zdMCJK(Q%r-4zl=Ms{x@RD7XeXb;G2;BfS8RKJ0 zK%J!2VqCW!0~|WlMW39@(va6EuG^-!xKgkUO)l*>zqF?{xq*9e@oDHY)dS{J#%BoD zeZ)53-J7jVj0zLa`XtQ{51$Gjts#f`0s^+UP}wP;75@{{NUq)5pB3}@ag3ZNc@0f8 z1<2?Q#s@8^^5y`mN=X}jHoSu2E7Aoro%+Bv{@H74@#l*?0p&70idSj_?&P~@1?NST zq6!Vfykp{Yyv{Ne*``1lS|Vm~fL8b!W1qA=H2v7ObHW$D9OtsP5-1b@W`IzYS+#kA zG1N-NI1_hICiBhd#1GDNiyKs3pgd`!ADu1PUUV7#E zsk=T9vAwZVzOTM?NjhT-v?nY&wySgFsW1HNGghiydV1gVp(M~wUfHaW{IH=ZckMZs z{P!rc8|iNnNq3%&N8w}mx>^mTz6Y&?()U7R*QA_{l`tH$zR?IKvTrHvuy~ z&aXZpr1j-RJsv`L0(V~6YSoKQW=z2*h4BHr@+Xf$yQWs#i z(5-joo?$0p1%n!6(&)r``7K)J9*`X65y&P8EH|RdPIZgI--D)m?VBCA*O4Gos;vV_ zj*|dJ=Pj3*0C)x-Hv6LC{W*2)^F+T_P&p^ zKxp1qBNCGU5yV-atw^$H2g2JvAehH2a95hLt;K52nGhZ8=hn|J8}7~|W*23|ly%Vl zH^v;`8s5oDzS?x9B$Yr5Ql?I=|ac@1TZ`FbyvNPS0sA^y>qko)XgO1N9z>+Vr(^x#bPj?s!h-Dk9>+~ z;%$_7-pEs#r3yCtKHpE0mJ%P38661Ah4kwaJsuIML@9JRmH6XL2v2&`MxAocp zt#bpY*WY1!`(98T_vCKK05F|{vjkYdWw05+?`{Vv8lYMo;2;OQL^g4cm6&*Qv-uLl zrC&+YeG6b^GtKDmZ_U)~DpJ=8r8NuKQ~jT)LlK^qtmGEz(e4AZ0&Q^jMjJpXFX9ft zu%~^h+RzmAuX0FGW%5WYWkvZOZEN=k7%8Rt!WP+Bi2QhXkSpLMsfbT(2Y^(qLrVqL zxWJOGPG?7)Av2gba?yrpF)bcs$kMm8?rgckNA}|8u+aK;JtrEsM_HaIT$NgOSGOjq zDMx?-G(JO|U{}ZAQBn_G<%D$u`V#XIWNr^p9xCX~87C;&BuU-bx?pD)4R64QG=)8U zHovfbGn}16WEA8^>b}ih3UndnDw|+#nFTOKShXM*Czyw$YVeGdUTgC^4fKXW-*>3A zH+>~T#^Nu2T0O)l@u>!PK$j~Ik@%1dA~pXY+QEEQHpICJ1~8ssF6t&U<@2S?!&62d z4iE@{re?LNG7?TNo{M7Yge0)v@^w77IE45M0fZ zKP*-))RQE}zpw?DF>oVr*Ih0ex{Bb2v&{i;(Al6}&9PA;FVW-qlCN+oHX+x-V7RE( z%-x&{nh4Bi=0y{t;gPZrVtY6dSFNOLj21Jt4No`sntD-{BvhcBXa#hrdg& z7jjpG0G^=ijo>Qi&`VsBqP7TTz@$$TfWMrBA@>hivf@Zh|X zpE~JP8Uj>HgWM;0!-G`cFw+ZN-ph9HK2%tRu6Jvzza78=u2wk^8Ay7pT6?G5s!%Li zayBk#M1ELT_a$qu9njEgFnjy02GcX1K~`gwT;po8YFOk2l$XwQ-B9)qWNEQorSh-( 
z0KD|g{sj0^q$l$nH@Y7e+~{F+5;2zql&2HU5q=2++2YFkJ3F~+I%B%ni^*bd3n=L{RxhXcub6% zUWh2&=qa_e2Q!NL)6&0-59n`fUhu#L*2gq0X+;jHbRNKJon`m41oG1CS?tG$7 zp6`jI)sRA3A084dYIFPw!(I5c_ru0SiZOBdB;Z8 z5tBP+*w@}9UNjQQD5L)1iTiFz9ws&!C&U$C0by^_b(ZP=Ttw@xG!e?c$kV44F9`@2 zXzR2p8@=!ycPoP!Ev=6hMbZMRGU**d%9z9{S-iZvu9PowLsG0DUYD~ESw(z-&&wdt zMifbs?ggLYu|T@xkL3`{$9+$uE0peVHqH8 zbjcPKtZmx6DC}W#T+WWYEUs?nl+Jk2Avf89jK@Mt_nx<+W;%mD75i+G{LjR636&B|O?1%q!g%@Lu_xCsq7JJ{Kb5_Q_GUiOBnB~Cg6 z1Ke-D74!|L@KnjGz5&k%tj6ei2CNp z244w2LnS|6`mq^YiWwln_egL?DdLYeaY^;Jc$+s2FYh%Od241ujqyT}Tc*hc@u^YlzK1=A@D5D}_xl+S@mB`KB=OeyFT(3;y9XO@TRNK|a>v4S1Rf>zmP=*Syv2L%$ zn6-{#o&pE?6S}mEuDmG>zQKJOGdT6eHBSSdmReczFd?Kj$* zi2&$&;{IX7Q&X&qX~+Wt%@9kmDVV;w3)tjmc=7qzPURC0EFP>7-@WqoH`)g9h|rr> zak+902VTP8nN8Es#>K=c-tx2C45r;L4`QERQ^Kq5*G6aS4Ja#$(r zv&r16B2@?Z+fPT zID}d}A~Se8(T6O#i=pIu-cvsAu)S1im%4HdhoYE;@-^XSH*&GV=S@JPR^mQr0y7>S ztR6PI5VCk#CZ}q2KCB3{p*d2MDceejj9@I_ZewpNXS04hkiOQ2&J?B`+EpZ?^4X*}d@a5P*lZ7hVc~MIr%~x4TPh&=$vpG= z?l51UZUYS6P_90YDq^RMAO#LQ=|!zQe`_-seaZUvU07wP?$bP_o)0V9soq+d9Z9`W zsV{Q!_~IyfN1ym0l*}F}^&TQ^>#zzAM81q*Xb-%AJPDcC!Ib1!z?caag34bR1xDTx z{|$U+J2cnZrPNz!M~@0Su^dSRlqB1%OQacr13>&4Vh9vF=z=9sEB!Ezn15*VQ%h)fscyQ_1A8qYQS=+a`_!aZ?P!OA60H15aNdeh=kPRQN%Dn*`;pb^;i6{ug*a ze2-?^dqMGzW&aq9d#vx%Jq|3Q<|S#BdrDtjt_-~`fzQ1(7+0$Hv_U1v?YG1E4=vGw z=W4v%CuV8xbr;?|wC13#cV0HW=6F$ogIs60`IYCu*01LTv$zoEw(yGO-pfyEzT48N z_mnLjDoYQ?82J?~B%ao~ej4 z$^KRb{ZWVCKDr<9nKN~xznRp}{ji~mnwDa%@{eWrf+;xuouNk$oSu<<4$LTe_=}na z^Dli*YwgM^IP71|%Hp&6C}X-AFbdf-)9HEZQh}(@iTjIXRV4*;JbWQv@Fuz;(ICof{a>P?cHA#Ay*>j;#6P{N?o>MbKQ7Pd5=;cNs zcI+td4;dA#3oB_m`l1n*^r&&z$ZbkN{;R;%Ps9{Di2$BEzxGG}JnA04KRXlc7gp1b z#ocL)H2?Ec@BKf42_NzQmR#<(=TRpEMT^;WpScl@~&w*vjnYW*h5`sWw= z_y3B0yN_1cMo#71pI`DD)bi)8{rzA3?E|#R_5a(Y{F85K7ok3w6Z{u`YcBt*U-(nt z$j!}m@ISk_+k;1)RPW3J3B=or#m$0E*gSi)^1Eqo0Ak zL&6VjN~;X!jK8`l`S|Or5TW>g`cnMM#b`fu2&i4p2|I#O2Fys?@AN3(+{+;B7H3m3 zd+!T`{#PpSzx1uKec zuR)Xd__?H4v916Sbjgij*3}RP(*cG+Qa#%XGFVsO#8Ux$Xq}E9qWO=u#KVuL-MZpN zCVhamq=GuS004j;#$Ky+%vJ+hz=;*sS=b5Zbpr9l*O4c 
zuz1kxRK?ZMNr{E_eWqDyk(RgYoY?^jy;C^MPle-@gnw}86Uo}N(7=Sxb@zl=#yac2 zwEld2_2Rpy$jL_*Pj3KozA_=g$D3v6Uw`8;y6FDng3+716K4e8N-i81P(ZI9jZ$E7 zIn2b-;r8PH+Slnt2IIlx<;$+XFsTd}?zj1YK`gX1oK}>Xj+9~ZIGd^72c~c9(P06O zFaBbm{iMi%0p4bx z)sgti^4?B53j|u~5{c&j(OZ;!5*)`pcR=>~*GvBQn`qAfnaEsMNJjrJ*YH={A=4j3 z8E1|3rh=) zptv0jb)*}$vCUDJ@baktf?11j-x~uV-`yQ$dXz%Q)%d0xbG!1(xgS`88m>gbPflzOMnKhl0o0712{Mi z*Z?}l5|}3GsxGhn?@QoNOyy5Qba%j>F(NecO%_w8fm3NL|S6QKeu0< zbH@12|9)dI9HJtNwdR`hiTk;3n)ENtu%iW3QpGvQyqZ&AJ!e%U86U6jrdt(Wk3-Q=m*R=_^zHv4n(}L@RrtaBtk)%wl5uYO(xIBN<9E+NobS0wQLGkg9Ul;&bd1zhr@yK@2b z8RugaNHnDhf~si}v>&QcPf2XYoS~Ab+OI}fbw3+-8^L_(%UBpFoHPM6n~SIqK}N~+ zG}P0!4z$&pK;@|i<)Uc^CIj3Wn8J*jM+A1U^F!vE4BOw|`i@CIC<(IxIutuFavr%} z42WLsz;bqZ5qNtwPO7u!q($MKG!?%W0^EB=C%BI$6v9&Tni`^Q6B46YOF7WDIr4I! z>}&$)-h-aKk-v@mw;%X0hD*TvsM>W-K46D-)YA)QaQiK>xRzxfp_a9aYW)jPro4hg z`hs-*)02pt(CFL@>VU@$fHssVCT0n8%+PT_q#Ka^Bs()KD{DEF!0{9CO%&8a1%Un2 zGjvA4ORF$myi00hD5meY00z$&q)2UJbmEz5k~sCigumN-$r~(z3NnDFQ`0YP*hrFz zWU3M8xf?jmRkBQztZFl%V%Hx_dQ{C)#9M&&!zyglnU9Y6koKu)?ARkQYXez4Q=%`*T4?TWUb2@>H!0 zf4veg@tc5ceb<1js#J7G0)u#e<0vqkR>{G%{Y0vx)V6HkeWK0>fX;NSPPr*D8sA3` znai?}-)5@j*C*!s{wB@~@S57iGFAyEZ^);UPNxD!L8&V5Hs=3MhEKIX2ZmekiRGSxu9~uO8+4ihoH?%KM>=WgZz6sA3iCRe6oiqyEB~UE``>q>jI;Zbc4D&gHRJ$rtT-TP z?>RsJtnG=mHN4z601!nItF{&Ju^GN z$>FiD#!pFI0ENws&tX;*fKf!+;en)PiwCZE3oPJHsW`(0H^3JtWAd7D3~p-krH^`8 zVTn2eP(wH@JG+=Zkp>=!RhIf8E`Es4UgY=5k9+6xPAg=8`wSU*oWJ;f95B^VuD$X~ zD(spMdhQkpH?r=^ZFq%<5aedpdPJks*!F6`C#LOWHZIk1G9ZFh)fU z#Od1ci;7u-r4m+z1PXy}pXDQP{77TUnHBW5Y4qHHb-=A)7nk~^UsFR4atvJA^U%*s zv>vgx8ikhG#7#Na&dl6LhmU&T0QZd|29@6c(HQL^R4ph1dNRtfURg0~XMg$p?T!8L zG3vm`O}6`UVCae1p3rv3pw7xYAXKwETS%i zuky9LTAjRRQG?4`0BGEd)Fn?|3-5?nH`fV`o5z@^#<-GcRxyez9o!cBGc4cHStA87#s3Yew>@r~tDEqOd>udFXxy)3I$e@bKAb~CMGT@pjIhda$D=y1Pe6CC73e5N3ltXLK#wJG<5k%nci z^O&f9%fUR?11-7*66@keHzfc4Pe4Y+MCi7{t z9E)QVgDmX%E;rbPAYMNGH1SqXg?nW8yI+71RH?2THDhfvS9ANby zrkz2^YzO24DmCygRUn8HAUECH1>!4biNKuw3?dyUeY#x)VzehWU1?Uj1R)hv;s&?jB2ja$!NJ`lJDPyttzF2BnYbhsUFK7B?=dynJ9$3rZHfYY1~ 
zYFuA(0sh;JGq1MKLv4=0zmY&EZ9K4EanM{QRrYT$$uHlYDu00dJv5=ztUEx_#wLm(Tw z{mNjK9W0867k~x500={bd0gwH-O*984JHCuWh?GL#UvS*9cB7^Q@BaPG`IWYNDh+j zrr!*_%En$a9`XoddVvrn^>@wS`uD6#i5&(kg@He>Y!MrA{FOeqKF0)G9>ROg8}}Iy z<9Uv?)&jI;@v%xZy>I)kkk-Hdev|E;5{e%x=c(`6*ESCK(H18GorR;iek*l#$aB+N zft1?cB!G?fTAD%YJCrViWwcr-Jtij2V~64Ft8sXXPeq8A`lNH`13s5r3p$!cZivD3 z8sk7!vkrMALgPdI-$qr6#@_Kz6e}3$i6*==Cu@kpo;;a)9a#MC`~Tr^6M39(8dB$T z2a6iF<#rDucl#I{4R}!hHzen%(w)F#HDgPrJB766|BU41yaB7F zUN)MR#-IOh2SIli!Ejxx0(e%1yfF!CPZT9axwd}n zi2-&34Pbj~meLko;Vbm*Z$(S z0G1c{rk5DUpWlxY=-Pa*1)J9`YM@;>Z&g<&0m#iRV}KvNUH|Z}TaUDyhk3F^&S}{C zf@NIN)kOLWdE`>7T0+!Jq!$XYJ9#L`aj6J)~U951lm)wKl>t|XK&h6)&oI94bzjDYUMwiwkec7 zz#<@w#uOLMg8+M#Ut(;t69B`Ay#>n++tEmpsIl_e%=*u@|eX;v_22wCuzHG3W59xzLzhM=R zL&VRA3{mLbx=ANLB)SUN!#YAZB{SqQpSuO$sp z!EF@@Nc|_`@pQIi1++7DJuKZY6rB2UrHLFb8V7!qbaEp{`SjgCLznlQ7E>ql z)kPi}LPv317S4H}nWyghL%pu4*ixh872_6$kw0&Et-AYe_C^_5XF?ts3NrPCd~H#B zYU};1MBOz!-8E(=A}s zA%&MzquOZRnHkj9lp84h^^wz`UaKGN9TW-lH50Kyl*4_rheUVoRCg2LpbcY$lE19| zeh8q2OAcvgpuPZRlik#=wkPt;X;)z}p8~P|lmkYo(1zj#?xYz}{pHvC)B)s%rnM*} zLhaTEc~Nj+F% zlDaN3gDxQ2SM;pg37P;H_$;>0bl`{%G?0_NM9!nTs*1;B{zOsNZkNq zQ9a6pNZ+v=|MtMdDTy2%IRX8Drx4$Q-z zpQynGWXldpo3&af0`l2JgZ19Xxvg1{K4xDx0MNhwBJw9+(MPX~?aHgU_dQSkf?v_i z&V!N4ESB5rW%B{GWn0MBDGgo8vxwTdG4C{&9eZ29T+ey#5&Yf@sB!HD@mELi2IO?vP=A(3+(e4P$TSZ#bj+vR#Wc(`O&WGt#t35=A5j#d6o&=oGIex?lOB>W zF{Z1ZctZJ)<~894)9SA1UwnRkI*1M86&n?O3-OWy)(Juoubcr!4` zwU7L^yh(M$EYfnGT%z=v!-pU;zH%MEB|KD2ge(M1t6ydkrtPuy1Z$^^N+zUF_7AeJ-_rWQvSlU=RRw=?XM}%~>_*ieH8(a!K&Uzw@yxFb z8~#z1dGy^4kO=l`lfz9+6xo`?fva{MkS4wI#nk)?PK|9)z!VJGKn;zpCR$G@nVAy^VhmBBOkjvehcCP8&ER4USJs((I^njDE8SGWB*s#aXL8kVQZNDN%q@)+o4 z4`dXBeNPWHHmA=nA~OQe4LFp~8otVYVo`W>nBk8T?DLI*lD|719Sdn#FlU)eKBa36 zb+XBCEVdhD+ik7!_zC!cc7VxQIkA=RGl>N|Nl)Xr*1T)lv)G7Q$^;*>^aS`7xm}>i ziUY?@>IGZw1kczs;#wa=25R&NRh1dJ(zKT(!zI_{&x0H)YR`{8HXm`FB&YOj^0jO$ zJaAU|oTuv31`5f~(R1mI&~J{(9+LVl6ZA&Hx?B=_-PP|V7qW!XBlt4|p zPg#(~M#e8cc0xRxflgLs6@W8?ci&u^Uc&lFf@3V!j!he%pvq%w$Cc7{z$0aB?0S3G 
zi@&8la*C!#!ng7Xb)E+WU}FPh)b@IKjlShj|6x$Tyvj*;z@yO*M%X02x07GxCX^FN`)_|8tmIIx9+_fVD0ysb4{!BR$ttg zoX@@v38@2nabjw|X z+FryiKPC2i)?tWnUdf26%}hr_4~&71o|6CbBa(?CCopT)iYn|h%aom`66`2c$ggiF z>cdgyLs>s^Ea=UH^{%m6w!}kxRVR7qXbPG664IsnTs_KWSY|Sj8FP>@Fc7l1d!djU zZ(3PSzsZ!d+gx~VzO*T^C^#k6khN+!dvIX@-?&Lti-SaN8pIx z(B==-!%Av1M^rK976NUbO|M|-`&5V8xyur3#n?gB)r~>)^^kK#!rHll}Gz16?zxy(2lMW5yQWhQ~s0*;zU4u zOx=XeJ-V17C|gi;Y2?$= zJ*Oz`lho_G*-ReGg$6_YMhQKN%l$h20Oh@PPtw?K>p#|Hyd4o8i9q2>G-G<*e=JBd zNSrsU_S{e3-j$g%t!98}*W+UfLm8)@c-n*y^5phS6omDY|5e#h&F7EW6rs=WxZAvZ zOcQIG3*IH>5y?0~RH<|83XGIGg67&AuaZ#d0m7u!9T?nkveu(zx3r1m2~SdN=4LQd zvES6zcRi1?81Z~Cx?#= zO0H&Zo~;@t9?Y>-DYC-Sebq<6fggBr;x&5l@g@{=cugs^r)d!6#AmwwwghnrXzwjCMlML=_H%||&^d{evsbWJ;*p35h&50ZE&{xh38y4@Wb+x($&dwR2J zZUSr7kpAsB&~Yz93`cd8F~v&bndg$BigGl7$JQbLNydl8rrJrh<5yd&s+ac|h%M{* zs833WXVI76M-nuzUzOR{Jb0bt6t#j%;{;xn>QAdP-}s2^n683dZg=TwgO87%ZhP1A zloGQ6_OJj;m4V`>a|FIV)2C* z`PO;}yS&e%F`DS{ z&EE6dj?e6c2D_qNof@if?XThkNvgax%|-GTA~sdZP7TNrL11_u`E5`E|5^jV5v!8t z<>sHV-HD}JwSQ8wIPmL7>;dz!aK2ip#IDwqm+=0TB!C|w#OH~UK>s=OOd=7cQRyu? 
zLl?CTkQ_8@!viwV-GEDa?M=6IO$Ij8ojF^?*=X%+niDP&Cor{&W&P{NoYTu9+6Ih_N-fH=UL`FD=97rCFAET+*gq%Uq{tY`krYuk&f=Idg1E{a41$cEf)@m zAlx_N)$5v{5F@TU2u|W-t3dFRHesnZX~uA7C12Bhl0|m)`x?UAViEVaV)*tRJ(KiC zPPJXIBmsd(Oh<7V)-(=-1ioX^?LPd(^ar(@k#pgCA>!G)e^!P_d*nrt-w9}M$YahYdd=|gO8|BwJKuu%SpV83VNEDA=oL8ov`TQEh z+&cpuwX^*~#sRGLl`7lyvK1c-b*a2E>t=EhA1{zd#Vz6r{u0%L10fC5p|wzNtx7oC z^}L$2NPa$8Gi;{@bKXU#bxCxDdw-TrhKqWGAcCW?!$LA7v9fCr% zp(^G%FR0;^mqv?i8C2AHObPW8AX|66z7fI9Bmk0xp_CQFV^W0oqx1`7yB;_H2 z$l4ZL`C7jkTUL>(a+FgA!&K2v)ST;2+TR4{!J*0i2e1!%xKEN5`@muJm2CKCk+)p( zGnM*?NL3GUTbJUm>cgs?cpjWM;<6HX@~VN8)z|HCb$)fBn=6ma1M6fS2BT`HTEz@= zf9ClM%Cl;@WH%vj1_+y?;F4P86FX()g}$1x3}wvm#k{!znvNnl>+=e4Dnm`q0*W(} z-$%vN?LlMLH*<=rC||qv9pw{Vg4Czv;{0I;S2bl+rP_vIh!Q74V7+R>sTLK$4x@IJ z$p}tCPgB8=m&F-*iVdte=3O}3- z5z{tPq1@X2#0}wtVU`swO91@^a|kxT8tEAxpD9QsY)0*=kFp5WF9e3KeLsKZ-1PJ5Kv@>4!=DlFnsW1g1+h!`GV z-i?BTY63UO+mmkhX*X5L*^}`X2Iz#^#$c7u&rzlZr<)6^Eeg=ENrHC>8w8=g-F1hY|0{+rNhRshmUiF}=r<*%qJzHfebrbd!3pII>R%LtlFqbRDYYvBRP*qF^ z>c0mwv3*tgxvMQKHEK7)ryRZN*_mtX5ke3Vq0J5CMp5^GRzDSKXiDf!abk31`Wa9AyXi$H$nHS-LY?dgz;G3bStWO8W%(zh z-)OW7TNLj2hh$+FhT89`q*1Nfm4%v94eiA2MqDr`fHm^$h_J|f%iU~ySAd`dtdMVc z5Nq8fnEuwqE1g8kH|l*5oz~$FkMF`(8aHlwMRL%A$r94^{mi`rfbRhZ(8ZqvND*lK zVaP1oq9NHxBOpuzt+(MhNYO3seXQaat7sH0Uuw`rTgc=ttrnleV46TC-`|nvm!6O5 zXReQNGf_810eh}Z+W#oh=15yGHBCXFmFL54J@yk!AbqaA9pN<1+Vo?kwC(z0Xal7= z7#kd1qgKXbo1-wBtF}zE9MtQZXl2|#<8-x+)ri(q`v3{q-QQ1jYYArG3ug|ob=sML zx~VVJnXxG3I@M@4hBe;6A2tH2%j>L+*Xu~Arf-1L_T&o{LQtMrBSv_rK*bxLL;`o7 z1K;9H+ZtF7CBe>&^HAC?OSUQTqv2#lQv#Nmn+tvutc>bc9dMeA9`0;_y4*9RHg$YyYg|;*BPO+&9>rWe;N7rf3*Gs7uPs$FO}gc) zud4`U@GopI6s)i9o3^R7csR?zPs;CPV$!W6f8zKPI1%U)*lXsn4OYeXR6YB$8@Kz+ z^=P&5W+>L%*W`RnC@{<%c2vd)jr0fv#fBqhkfqJgT{Tx;xeh|g7NoVC=kNhwr9=K@e}p?A*1phViFve@p-Mm^9^HAPzYR z149X@HIFR66fS8g=~*n?Khsz*2vhHOrC@F0RLnWiPb?QiK8*660yEJ;jT38sRz{uV z$?R9M%&8YvGs%gW)6PT8cycN_#;$cQwe_N+88#y>aC$Z)m(BZ5+G@3Ss`WOv@0}zQ zth{+!vN!ng|rG=Ze0=3qWBDByhIr+5rJA z5to@ieQtnF=%+e-p4y(g8*ykWcD08Wa{jogjGip`Ay;^)^%sKyx#xIMYJdbVzxXB^s^V};8q 
zs5(t~$TUGnm}v86E|aplzuTHx6fIRU!`}SUyq=|kKXr3?UsHcHoq4K2E!-lOtIoF5 zUtwuDO$bzEzc)0@*mu4ocfr%;YnKtdF8rE=IWN!xB)&U9b~7dQ(%s#!)BmqSUm$VL z#>g!_BYGp>Sr62EZL|g2Rc2E}bl?5ZJ(bI#EseIJ(N7)ZJqd41&JRd!a31g&qA7(@ zillX?0Gp!?LR{>?LWh@bKSpOoag$FRbS3!@VL4WHrg&oKUV|5llY1@9B7HO1Evzf! z49HilOGXI&9V+VZ$&1I&B$!s8Y-=+$7{VdR?S-MFwQMH}fuHBHKX7SU9DAQea~BF4 zsVXmkt73!h8`p(N!yZmr!{@}xLUwMQxB^qP;yK{;9(PU(GfER7h3mSM#Uei_iDGSM1d>QK8h{q*UC8nbak2qC!ZP7$FF><#}>J1zh>(BWsXsQ`>x>Y!U$2Or0 zfn3yXM#Y?bJJh4Ad_o@5lRbH&nad`5c&=w3bddScT$^8fH5R_>DFXL-wqNO?RASD$ z-fe%6!>n`XohhYXL(VMg8`WPnXV=9lZJNhR)mAMyQ$dC4n}cgMQf*IEeKVZ`YlB8N znIpNTEVjviw$qEe#8atQ< zR2^DaOsuo5GR2=p-$y5$vLf#cgt0|*x4-M-rD8dg2>YtD4|>I8Dis1G8hMX{%KFQ# zl<;xbKIon8mx`yEt*K1g#j_cd*1br1;b^3A$}ZZqL$gBC^1^xVty<07nX9Ls`OqMz z?GHZUJKU#RYPeJ|_;`Fob5Sr}mU+pBk#YDqLNuFv;bdUAJncQOOtgIpE&p%)@eNG|k25H4lu z(3GwtC6j?LQ0(u3WgbotR(U6bKjx{?#qS@m08Dq zkPoN;A8n;dr>Hgjaf-syGZ4TAY{UnL6s6TP3Vp;;cl?kR9374fUYf}%ytlS_s2Aiw zwW?Bh;XFmym)N1sA^hRjt6A|?__viLDMqAA$=Pd<4ISYRT+7~>&000BTlO6v?hDJ- z3EE($?JcDTG#eCxCyk%EvkB|dcKyedQgHV546hVv&gktp%%=Vljc#U>@Xg!^SS$B! 
zaj67Vr_zgBKlFWkQE)84?c4MG*5fT4w1pKG&2Zr7Jgm{!o^ym!LE3sg7po97Wm+6b zeR-l`1VrwM*N04O&+k5s$5&)$ACyyJZ{>VYUQSLz`)+Fm!wU#NFnt{=^(8HW3wDIK zZ@dJV%dXaJ@kJZDN+m0E^`b*MKE{IKF zeGrFib0oDWK36F*2%^z zaRv*-VLQhsOU1i!>8Rk|4EL}D#|2ua-29G>45jYDcO^2Pcct1@Qi@DMfm_+$)7d}r_S zfKNvarHv$Ne;M{+br=gbYo7RJi(8U^HoB~yiGWa@!&g?a%}Z=>Suc_pvD4BI+r9_a zzK~bPjaWQsV096cQ}GqPh{_T!BtrGtdLZrHJfB}*Y^&`v91qoao#QirHtqyF`VX2 zIr^KI?EyOw3r~zUQH9rfC+M7Raw0?>(T+0j?#v~NmT8H}oUDu@f{^i~s|NDsGO0*feBU5c6iV@Wl2G1-yjcQQz$aghL)=adClszUG$%A?4KPU)7uo$&6ti8Z~l&auK05hw)4DJ!uo{z?G9%w+`Lwo z`JQzgrBv3+7Yeqz;fn~*jj_CCg|Q{?L#(OYhpc;}N(m?9Hf>5mYo?BS2=S9{ub%!* zz%>kD*?l|kbMuH^0C$Wl2Y&$FQDIg=-h%h3Ghly9K%Aor*+2b zoF5-;61_E1m!J;3GD-Quui@VQ;_)iA5g9D1Qp>nUJ~PA{zInHqj9Y)}C@=BuRq5*O4vl`i^Y-&oe-7lyKL7P~ z8zU<$mB;j5+ZZxo^Szhwo(sh|h0$)yBXeD$57(uf^RC3sTs9Uy6?#vz%hGDH(&>$7 zLoZ>g&A?!uz$`k8g4!{DvoYa-zV_PdL~;P%so2U31AxleJ_N9Y?Y@W;7xCT|(m>d( z*MNHAyse|afetgm?I%Kf85NRjB%h>IK_mzPgD7qLx$Nk;Tlx;*a@5&P~K{d4)&n{^lVpb`cS@$$vjW zbo0^H(R?9mynxb=9qOI|djGo*#@W(`mFwnP|BD4cb6Oak;iw4M?8?7&2&JUX;cUQ`wz^)q2PPRU^p`3I&OF{1natol#fmH{4Vx~AKN!Ey z&z26iBGx&_U5{7)OXT{uagF!osYdWbEljN}q$pRe1*hPin$s0)^s?FSy^rP}IO&W_ z8(PG)QxYzRH;CtPHKq(UiFuPD!fi_qKPM=3RQA72m6)oV}`J@{4*Zqk9`l#l=HQxnE4q0$H&?wP?-^C%z*TjPXRzy-7C6|oSJCD>f zOTIiA@9m8_(^;@1$UIH?t2fBOEmRf_#Fx|&fqyM4<$UjZjruFp-^PiyeZ51J{^yhb z_@h6++T1CB0_udwNifY}-ZYGgP5V1Q`~IH~%ta6bNXk)lMm zTGKIBd$j)ZpGPG9^&mzS&(!08zWmRBmGl>_!Pb9X#D9F>jCl+orO%rW?m9&Mw4$1L z-!4CumO=@_{#;y!Kk9b)GlpdS)9qmMgT%8k$iMz3BaOBw{?22Og-2U~Kc8*N!HgC4 zEk`ZyG&*{{a9sH?*9>!3Rwom1p{J*K9{FGQ^FME_Xdd1cm&dWiNQR3Ci>lA#r?>uZ z9~Nq8-t(_ngC+mrHvQu*`!~NIMZaT_F^L}A_w2tP^56Z9Xw%~iKh>sb?Dc2=$LoFS z$NF@%3~S)apKmIv`$!A0K6WPZ!sn{Y!vVlTOSfO@O~?IEQa;7u1deQ0KS1&q-&<;F z`sq=pp8?fW%O0%)O+f|oS|8*^zC0m|PR?akS{lU(z{g0!r+!8jr0+o`V#5D*UH#J& z2hTmD4iEeUw;vD*d)2pkf+@|rnd6Iv0&Ol)Tr`g@DAD#t!+OHVi~KV7!9Tm)k}9S~ zZ7*2{W;%py&>#4BC2Fi^O7BulEPxB)R^`HfY{7H*~KpFB4EOx8Q)4kblUAqR!lf8Mqmw7>V=01M` z5qN>*>Ib+~=3;RDa{S=F^GZhI`O9i~rBsBUdt9Y;+|zxJv-8=xOK|p!u2#7rD+a 
zf6Pq&|G6F2xBW`_lpBFQWD*2}li%9?r{3MVJ&6#xDfapO{oV;Ea2BPdr6ma@fO}2C zV-?dHSiS;gF9moUzKQZr^#Z+o(*_ycbLp_xC^&c{VC1+!30d`&K~Jva0t9SRX+oYm z9ldLOV7M%`7mRmIf{1Zv9QG7d`|F#_wwvUByw!s#E?sDq*{~|8%W6slVylc!IB$*=fMc`}IH1qJH=hw-Sp-rx zDycaov&qqOdLN&0z9c6`Aau>vA3YhS2j;=A_zOosjO*Aa5LH1d!EB>yn?ep;BQMw8dMPulx2%<0!K!Dd&IaGNXtC)nUx`iWi38|jTBkO; zY)5$jDtGZApp9woQrIO45N4TAV+*nwdHKn2`L^Tj;V{j|9NfXPVF_v_Lki8|^&|l> z-)<|qtua`C8kTynq{t@yvf)aN?@B^TSw6o~ZoU2X5M zMz<9sB<3Y71l2Hh@?^6GMAAYIQbHUFyzi&oD;Hk?e85TYI+iHcp)z1ZIY@iho-pTJ zi(ghTsO4@Q5Fl9GYXg4}*+)Xtg}q-tC&!JfDj&6VTLoGG4Io=qmJX*-Pv*59$ngZw zq2m|+SOpJ&BXi)ir$w_yXdSWF|7ZDQ-gl|-8wU)p$WFHaRO@bCMdhE#k5lW1{}$=%Rb)5$ayeUqaIUwSXG^2Y#^w?Oi%bvus zNTXyRH2RRzSSBID7#k7c5fn;wP)K1fp~Z>w@3xU+4K(kjnr{g|IOw*2)$p{I_6#s< zq2Iy5Yt8K}l+)V7@ckX7N2Z|{wd*A4yb-!EEnR$p78L9BkkG(wRq3!Jeg`$Tfb;yu z=rEewE1XKt>&Va@{s5!v<6n4Yz(Eb%zn$AfR*5~bS_k(8FpXapGvAFjV890N`*-VW z$q)k5yHXZ->^af>64fmrgC*?OCCBDthXvOL$?EiB2DyWw{DMz-6Jg@(AF#FX(Q$h0 zw=~OxAEXp_P?fYEEn%ew^u7XNBm8)#vgXPSl293Wi6}5lC$%iSP>JslM#i6Y^u@$y zsa^T@p`j{e^cJP>RRzxDXaix~nWKlSrIuU^ly}Mp(8*{gP=a3(yH^+BfmX(-*OU)@ zNw#k-c;LH65eDkz(u=|T^%ReBb*nyg21|eSctxi1?$PQa`5XWOzlqZ2k(L()r6k^_ zz$u6F_m9cO<6|X^=2Y@iI2nTPFKnj!3P(ShIlB9Lb{OYC5RmofzM==R!ER#|lLUzn z(TTAczPr`JaXB1e6gb@pv}mO!q{;lw8z$bnm8OH0Q_pMdmzR{OHHEAbWcn546M{SC zlr{?8m^4c#i!R#P|B{mbtkLw&@7cu@n^oh|QZ!@!$$)z0w8i3?4yiD9YVOb8hQH$G z;AHk=Wim)Z>n~t@t(%cGO>=0I92Qn_w^k3^7FYNnKmFue_ZR$mzThfi>`s|3ZkfpP zOlhKu7T{_x(6}`K!teN!9@?29^O@0$!NIM$^iNBhy}CpbM(X|l^>hU;Sc|PN%zYAW zJ=X}0EYM?2D?LezpXY&2t=d8r)1v<+)cxmAxoJmhnCKIGq&G}bzrAqXdA)e5>cqBI z@4TsK%B3i-1DrcF;!KJ(H4Bm;8o!`x?b9SHyf$B-E>ZU?3*q@QVB8Yo`AqtXohb1N zc%H|pd1HTi3;JJ-^3?6^Upu%JJ9vn+`v;7^Ru=Yw*A=76byuS$0PAX_OJqbsFF%ES z}1C%?Of1|SNbG&4?D(4&q7Eh4dwSv2L-!bzbsKk zNBct=Z1E~lex<4tY~6=%F3Df?8I#a`Pa?YaCEWlWbLQ|T5b=~)w`s9R?>vt&=i7Bn zqZ)aW@?mfESI}N*(s$L+IxXT|@B$BoMweS@L^`SlbAG73H{(93TBw~tBPU~u1mO=0S;7hNOdqVzG-pf=;WgZ zlMX-otJ6ur!JQVFo=SK1@H$DSLxF#{Uq*;>_?7C5-8TprjNeMAUaBw-f}Up+NhZDg 
zJx=QT_3+O8X~O)^r}v7Z#^h!gS=Hl-qj@$4Br9vQs@A_Sif~5W>wG~}%-wmuLtNYQ zP_s5G^g|~H^%9uDsiH3LxJ5eyC+~Yy^y!oT7)5O&7kRB)6~BCnI4#HT@&Lyc0>(Vc z*QUDK516n&`GpoS?>)we_H>x9wPnJ;f0rYn`p0rx8?@&87r)zZN1I#XyJF)4Ae(}= z-w#x^@T~kyr4jzcO7gQFzK2;YRdNOMTBn9R5oyjoeH1a%N7UDD)wWU$R3)D@QkBy# z$#X0j&9jYR72Bv)ujmCEfqS0rK8aXKikiZF^E7y~w>esU+^+T9EKgW~a@mb_zWzIf zhI!}LD-f=kh`rf*??{Ri)U!aE-Y21mzaIJ#ZZAh(rfUuEebD7!_U`?eGw6*jz9XVn!}C#I2I8of39{s%}rW(iE3M7lX%Bkn#R z$*JO_d#=BB|C6~QeNwcf&Vp8n>m=cu-X_+^1{iJ!~ZXF#x^Dl|dzkCV~5}n&LPcDJo zjT)0iaXyE|7tL=87#+2&TC%woy0txDclL#Wl6AI_RqMejm_x#h_DERplkjmU;Z=T* zt?ne|v$$hOe;fH8H7Qf$M{xaUT8gYg%gq;y-!glGIA zrSWD)_a8Fm5hJ3CdEZy-@5flPM!!>y1M$7}Z5n4LriM)qZdl+B)?rku#^JC%?D@>F zailQ5KHqBNh3WdE4{TBW+!z)yc4KlpW+}>PW`bNo2c7kG$z<5E@F8}3+c*fvlDIID zm7!}g%348z)FAk{b*qt`snNcPb?DJdUQ6-;wZYNX?F)WTW6&pyxji4({)MEdbDjJG zGjvc3#vb6OD(syySZi17X(sf0GM$Ab{u*01jMwSh$DkuOMs3_?W))19RN|Muk}7^0 zVm_J0A>GkHp?2s}@HSbG6%~}(IvtYy`%T6eP2V=-Lsl6GXz;!;Qhp7?kd{AeSGE8W z2U+YGFh~nM$krr`YscwC%g9NvWD{an=HvUNE+p*Bv4yd7y8F6#xzA=N&c`kjhHId zqs6g3aAc{b7&;#1HGwEKT!T?D1tf4qiBQ|E$S*E9k>?D0)B#dphP+voL#8B~_;;VN z?@`OsBkX3Zzdx6@X`3x(c|n8A-xFHLjwUA3=(H{uFtssyHk2;X#=vwO@Mk|!8VjDw zuJ-0I;;YLqd2oK2_@QpmdZT9aqN;ywJlz}2{>~nKVELR?@dBPw>ovbV2PQnSd}lri zZuOc@)u3Y$Oxn0IzY!1nj}1gB-aSH14}Y~}t^4`Ey}|EZ`nCQPd-;-@_8AzK%+M%< zrrPfMf6A~fYpJa^WVf(dlb66xb{q#;?F_xgVoT-KLowPs6Qg-~6F|V(u$1t{jt{(F z*RD{^OYIA6=;c@=8!qCCxn0)!9-V;3E)4?fe$Q5-{9OH39D|pPpmRtS&=4f{^Iw|2 zXQ31e_0^*-92g*B+-z{(0UEhQkE)x+wlmAc_C>tyRq_$E~J*#S@wA~1@#`N z+!X;v(TOf8nuVyc1-R<4mtPm1LSh(r?;-vD5mh zi@JDtRPlJxv^%ZR?Fl1YEe{?a5TG8I^#Xb>$I;s)Y1)?VZ+Ved?21trA7`!Tt0`X9 z0%n17vlXbn?0^_Vom2My^Ds~)O%I_R%_g+<802*mJBoSrAqUox+^~SDoCp z=l$oDoFg?teT&twcJY z?VNab85jrWL(EpXKDo{~WLoi;h;yH>6si|hl8vT+RQ#eWt5?XfBoJUewOZn^5&3UV z!7r(zGbHpZ2CaJbpQ6&6-(?0j6n4!R`0C3*G#E%g$Jwe{avrD>sqkpX^Pi+jRcu~CdjzyzOY~$ALyM30ut9PvP*i>VDGQJP|OM!6C zeB~Wi?e60`w>_PMyOa_zw<_qp&d|^VkLG~Lch!8MNN{j{j42oPu<&&FER8HSXVrB3 z@inuBNCKD`ST6DzdbK;#jF`e}fJvBg)@vW^k~U?rik+|LN1LuLL}|u)9giq)qkNfX 
zgH_Lui{iM_eX9_lp_**F9vFLd4XRr&2Ok$w4h{{i1u>)g}y9EIy{2GUSmd^ zgLz1C8hWD>8!)i!;e^}xF&2L45iEZ7do|qE^!_K5L)M?h>@F2{7ByQjnrZ1BbbS`5 zQT0#ewdwn9H&7m{CSmXpN6wag4f>}UUl>)6GHa?N&LIV8&)8*!t*4o7x9Y*>UIP*a zSFp0>U5Kvw3SY5%Kq{=XWQ4vDsnt&SFIZ6K{y)OLIxebh{aXn zm5@@p8>E$H0F{yuk#Okllx~o2>6Y$6q+%wpdaFUZa9_d<>)7X5kQtrBHVsHYa_Ex(FttP`(2lP2Q zxQfd&1?~w!d3J(*yM>q^YWYge;tq4fTdWN?7Q_+ehFt-08jtU_FSXB zVjX|(JeXnTd=y19O)C^AYgn_H+;({{ahX98oHqqwjPaBLBxS|M^@q|6_8Ir3t z&lCIeax}kfq9Cwp6~J>k0P1-L)^IzK_q11*%P6!A=zf8TvX*GRFkQQB=2uR4hK=NB zhJNXA-MBT59d}UD6&Pa=UEDw19G46Cd!k8|&+*2g1;LmK*5DlrmVN)w%FLh9UNaw} z3GW?CB4J6HR^I+m$btS?Z6qCg;tfy{_dU>@e8pY66UXQmZ|1(W_zQ%>l`|Rafe!Hn zgW6T7qlQIVL1RxhO*e)A8?^rg)q?(4Nj|yXPP_Na_Th8N%%yGlXby=Y+lxz7vL*Xs;{d$ZUz&cT&O?Ft)zZ8L6=gTqRz{AE-u@NIhb1sd!Co&u}* z*!{00|MlYnz~yZ>-a(%56SJp(eSdmvpf!300Uf*myucucyf%jD4}nH(1-2jmU(Zzt z>LSDg045=;AKfQ$JNh?)?V_G&Cd!Sc$(q>N=6!TMjzXWr0jcA|cwh+A>rGrZXv~nH zp?nEux&QpdUog%AT?p6IJ?t9#|G{qm-=70K z5&&}AsyKfCkG5Gb@YQ_Ig+-M!Qw|L{3veVD7?I2Uj&(il`~ zQ9VrrGRd-7OYI+?Lv>fr9*}P+)w*mK*3SOG2ZQCoeq0L=3!lDM*4u_ALiI^Rth~mD zkFfyPl)3svg1+oz%-A-z`)O74Zc`C`^l-B-is0Lk{nT!Ux%MfuSyvZ@Cp25pgA@VXEJvm%A#aHZ*nn&1?S!A8LP-WDU(XRZA!88GTBKiPFM-)l`jwC0 zf`5NN9t(n7hs2^$P$Ulh71<%`9rkW%*C5h!2wWC7v|2u}OnY33O#Dn7E#y)N3Hr5IE^%+mW2uUb zt0w51kIA0O^f_c;km(8ZdxMDC12}_IyO+|_oxg2%(9SW?1+8KK@=Dv88otONI;Ver z#F~NzEmi9LRN9~5_*t~+b^!e=@5U^U2J@~#8#=ja3iAXTXSGW$<-R%4v-R65en1aZ`A-t@t+v$u$6od4ZjvC8%t{#fg?<3oPB{{G z9!Ur^k-1&Mxt5<3^9R(dqX3+D73wqa8&rfG*TaCxk#7xZHEtBRtd4?OeiU4*QNSa< zUnOefLFd~P%)9Kd(0uuDt8y;4sPSBDYqB(pA>Zv4NR@s$*=x1q4N4Fs03o{c$JW5| z9&j^=qW6shdCfHdCL4v&aL`w2aldM;P1!EiUf6R(U&%TZXJvI({4Ja z(3Kx4<`67>^2!mQuSj4Otfq0z8wmcj*2aqT^%3w<_gmrmdKM!oWv14cQHw+BIrpc7 zitpIeMxyQGr_|1urs(@kgflLPxcyw znA<&jf3m;UKaRnldeDB5x^{7VkN+x}iuZ&pWU75t`<0=#^m-S7%I}zxI4VZdy?W|F z>1CXq>%*rFV9`JGzx0ttyo^GUb$z-E>?}3m)QLh{mR7VywtmI z0+zN7|MIcyC@Ll1qG;WPb8K<)lIJzFu6?_6=?AYkJjNvB&MM}O+9ML?1Nl8i5!w2C z@Xo45ZqaBizcL)|hG0Ovcc!5+a;-r!Mt2~CUSd;qP2j-N;(#lhh`w6~)MVo}274>$7@`y{{AgcgE2_4=wRZh!AJ8lVQs` 
z0w~sHP5yG#1|}wP1|VteSE2r+CBHs^-KcXj!!{Pt;L7DI0^vtWEZG<0$Dz?mUJHI? z%Vn!o!1Bsz=>z|mP77X0Ea)MXU8u;Q9h!UX#}d_J`R&zj6%%hUuVz)qVUurP8DA25iO6@b!Gvr^d| z2yza6KWSElL64Yx9N)9P_=m$cGupku*dvy$C;eKUbtP!zo==2#uLhy%KMmjNsJn8h zdtA?>uWot88#o<1{<{2-!7U(Q8)N{0*yU#UlTj}0SDPg!om7H?GA%{37720%Gk`-Y2GO`A^Ww2*9S`q-E@2EMJpWa zIGjNP2X14I4hG`5_lkv|@4&%DDZ9qUQrUSSt?L&m>6hir7k*=Ndn-NV5{9A<@_T6s zUxAb16C|*^?8n&^MeIdNKX|#_&@b27QK`=oKA17v(`{Ny!!7zgpB4kpZkXs+a(p&* zF=T6S8(2T7;q)CBs9gKrJI5+Jz2eS)T+6<~-_p7slfE>4E{WFd%F9l&8{3Y`-DgF* z?CQ{M-B)eI&y-#vnaxSWS6PqgTz2O<{rK3ydMRl8#2|Bfn!2~D@SHrcIYYN2K-lLx zP^LxOcdaFfRc-`RJRY)K-KW^wr|2*`xa>xHNFt3XJgaCXZ*)I=d0%-A%u#GH3s-Fo ztZdHoNWoGf=3mx$Au7Ww^-*)a5#hD3ha$PO^@9p&^2ibOY%a?&p4(228jKavnOU^qbwd5$lx6w6>6Aw?eF&(FM=###mZC?)SOCWv|K-FZEK zJH%tJ@TEv_*LF8+(MbEH0jqp$vx;D_<*o&1s!KOXCmoSZE6(Gqg5K5X9UsvpEcw`$ zNdg>)`dsH=vh_>{;)YCVG1|QpeaMrLR^@}#!}O1l=hrR{Jn3?xyuluI3e6H!G$in@ z<@q-8TG3a9p#jTSqrRU<(ofWw^vZ}|$c0>x4|pJM5x>Ys2B`;B7HFy!(IT%(tXeHb^Y}!iXp}H5$^HD zfKb-rC7;fzcY>S$*;BR(wzxbx)$sAF;?v)+?wBLm?c$93;>pH4w z;o-6W9ovEPGqL;{!hnWvJPJVz;r%Tu!s#?23l!Q7xn5#bUyR9rQ`E0JVqpa9FYuv? zqY%{z2>9UvK=8A5Yn>-kZ|b!K!|UoM%PgPw;Zw>}FJ=S@$fw@J0QPKG#*g=SX7B<( zGUDIhZG@=t*4i2=hS-t)~=U6Gw;QJ0Z>5%KG ztgA2eUa=KfscvdodqszNV-Cl;zq16tsA%N0ifeUBpLcJ)q6-4rgg&hm$oCZdzSk#1@%7 z73(Rbh&vDdJ2s1-jlaA+&)39$X+o5O?w>#iLO;I&_NGY2R{SlT42$R;&06Pzc$R&u zSN4jNhS!+SrX>M z-YBW{Dq?Vwl~h;YaEImrrbZIql@9ygR9T?RLpzu3sCJzL>emHAp zTqy)KOyH77zE;2d(*4dLXWUD-1jx;!3z6;`RrZFTMW#Gw1@QQHu?|5Jgv%}VZA(5y za@4`*aV;SWK?L+q=pNhdEM0aM637?5%m^_HJ+ zgNIGNWGIPyt=$-GBZY=Wa%%eB$NRJ>+CC++SbZkNPQd7nZI^j!w>efYHF4eRbu;!F zEuhHkm^M@ac3_Y?4d*9pT}yNIqq>{Zu(mOe(H}aeC?!uBW$M>)&A{RDu5|bqPfh(? 
z`K_A8VACz4#nyw`;x-2xqGHrj`1o0`7yqvtcnoQ~h$+92xPl*yDu}tydOuFW6o)fJ zmX}_*{mf0;QrR{Yo|gVrA{g?C+mVtrg89%hKbOFqTbS>|g`fnal39Jj`;6Us zi`UJ(;c2DKVHDBahqdTd9(4~(|4%+$1>Noe3PBO1=D-u0tU8@>GQ|aRdfZ>w=AJvo zZJlNDI=cg;39hR;TOa4;;#fP~wFUj~qLZjv({SXV+09Fah8KKAnaerjxZmrDlbZla zCF%R;(tJMpYwblw z^Q7TP=_Id0`uY)!CLtR*Hd{oN%>yJgwEmt|w~&&X{XBf71t+-mO+ngQ>pF>u-U_lI zSlfulXs@-q+5!AlE-a_p*PLO55Iy(!8U``B9c`GQ&sAh7B+ALhx7l##RJZhd!A@|Lukn!W&0|ZV2FvN@(`ECcT8{Z{p@UlTfaf(~4yB`*N8NkVLtLlt!*-kGhbu** z9rAWg*Ug1SYVrG>!-g|{8ZGlU@t9D){sfaIFG~V(tLt-w6Gmvz>u8pM25i{t>`LU8 zj{~hgu;N=PJgp;Y9{Zkfy+A>wQzu`d*4y9|@dur%>m#+Cfc8!M%|M6b#DxSOpb zcTAQrjL0>1T`M1S#NbA`<`vfEN2I+lda>Ef@~hM+x(d8_i&OM0V52Rbtu zO@cOXf~RO0sC6`IIq(X^2K!QlIbwzj3*YB4+gUf>e7>DYse01m8g1;D-H^_LgH8sm zxCSF)PI9!jWjuN`-)_)m&Vjou33mgbrAx$MrSBcnAmg4QOz*W`EH7Fr;=5|tzTagB zW?W_?f?@<)Dz&qjN$h96=X6Ka<@x8j%a&%*T&Uiofom9*m^y>eYp-fXG(+N3mZF1o zm|F}??~^k%8Aj=FDm8Zyq-sxSO*?Yx^g6=NXfer(XtRl%E37YZn7zfKxrP;g(}&N< zzqukyU#@+S@j`S8&`%@lcRs;}0D&eQSjERVMvh#O4foQcPE?mlRs{p#1nbSujfu#* zmm;+QT2Xb}uc%-JK!`3&y zF*bsjj(la6NxN$Qw^+95(71x1j95YIr1UE%^HbDy*z)8PPsFBEW_4XFeB#+vZ{9Os z7g(iJM{<;K(S@kA0cT{@I(Mt3AItoT1Z3P)`i*vC>v0XfipmYCb9mC@^mf17+TLIz zp_yo+OdlyCun2f0o09NF>r@t?X|Krg2JWY);eDBnQ}>kCrj3mtD@jmgMP7W2brLiN zpPUrFZOH5ma&t6W!Bi#>1T1k0k$tp3iedUx)h7t173DuO+>mu(*zcDJ5je>%Yi~j> z>h3~H)(GD~444OxkTa#67uM~`Qc=rG#w&W!ZmcbxEF<2Ti+Yd0E~VjIdg{SZbL%rL+*`^aBxISaf z$#5`)gMABVmTkGa<5`~Nf1AvfBJV<9y{GV$9CPdrw|b@gwktsC69DkltAODE6(wp`0SV%|0+{E{lesk5w9#dxE*H zB??JH|EgDct8j0iau_@G6B$WUpS?zxbbOz{26(ZEX<@ae7xV*r=2E4D!`FJ<6+?OC z?7zOb7&CVO4Qhl(&qogDhZN`O8RO+NsJ>hVt_rMBd_$DDUYXHomTL7<-FDA&;_IKr+XMwNM`e#o@B(7#Nm~G z1^UYhiI%rU+v9lDXsym2Ft2eH+Ov4I(dw0>>DkalUr`^xa;0J-(&h!^Z8l(Xb?LCe ztMa;U%Lg+cCqLBZ@OS9%Nx_W$2M2=yCKTa*~q`2MhAMPW;i~ zD|UX{MsqPNo)>p>CWyXE+Pcy(?mqD)dQPOcS$(g#2q)%SuQ)m|?2)l6&BhD(w zf!0ga=*6eYsadSJU~AC+07dS+ONpsc%~K0h_XKtaF=DDBCkG_pekdEYa)4`GucFQ{ zU`L$~-vj+5E1YwZ#RYv}Nv|K=9A*Th_AyKvga*K(ViGk~ZpH0wx8pmaACd?WAPRJ< zRaO`u8VXo?5&i=3xA{r2U&?Y|;N>{l@4!EK6p%4eKt5CP18QMf)ad%HS@>e=_%4&3OW%r3B 
zOu!lCGaE)(Eyuk z%vs<^1!^z|vA(k%=@t+YmxB&8+tg%Y3?*`Ym4lo~bg#+Vx@*pz)Gy9rLfJ!7Sb~wz zbJ)9~JNSgTAiFWlOR4+b6ZOQLZnFJT36tMbL&S5u>Mvq3oS9s{A>SQGvJVKspSc?2 z2^We1MVQyL`=+0&KF6mJ`uN?FM-8I<7}Pt9Dl_zF&5^Y(w1(EXF;C(kBF9YKt0>fPQoh?Q_yV-njN`5PrRd1M)5`UeMB|m>v z1Tf1~0Z#Gd+l|LJ`6xI!$L}QAVE_fKp^6IFtv9PhoIuAhHBA2JKx3T;=rl@xIGsEyGe}&Bd&^qu+7W!}a`TfU0I;)#BBM&t4Hw z@PBd$@=Bd_zjf*Tl23wOkb7lT{L#z273-sY3JX-Iv0Jm~7OaA%HAoRPM>&Bm=v3A9 z2F9n%jYR!7SAtO(gYIWyzagX@IAhftUC#I?OWjnUuN2n5qN0lSmxK4h4`rNEJo9y{mrn~w8qQGlajym(< z1i0(NN;ZD?2{SBCqO0!C<%uYuG`b4q3I>GVhh zTu)6x1lFFufv&qb42#)|IfrxJndu2|dU#Al6J{1eP;p9&UZiVOBqzTRhlx`BUh-w; z`}XIknH8`1i83`{=2JGudn(&i3Y@~MhF!S5&KuYWx$8D6!4)<44-oUF=}%Z6?@tgy zT)zA~gwzg@o1NK~D@|@q?)ZkrJ*BHidOygWhWl++c=PcM<8~Zw-SH!XcD%!Yod?E7 zwKUq>DJxRKG>OHprn-OL^8jo4?vqXdv+nIQSVcphe9r-yU!V8;5;>Z^z<9(j#cpid!FhRnLHo zf{};QgJFgaEB|=)Z3oi)1icS2z+=ivsJ{depAQ@v>RdJsaDF{uWId29fnKR0e^VY8 zQV`yeb#8QE4Jm$S^<7x{`4{>Q2=<91`Lc>eeCxNrPXZM`j| z|KhlzV#5@KWuz)A_>Z5>O1){MaaYtWP5=9pWfXvZ+I6Y{5fA^)Zgjl*7L+A<;1tUN zl+3aB!eD~iE09B_-kyKsGvnW1>;HZ-a1k5r^d{uy@9u6mNfodJ8U^=l6vPA_=!^P* zC7%PP`fT1YYybBWeT%sa{j#%m&MZoaY`>its&SCL_irsbukowK2sO)s)#kJUu7Vt zD~q63Via5l-84?h|NH%)a1#QOCNx3-K>+zP~GnPC8U;`CPA z2NwMxudXK-5N9B^GW!Yvc`_vAI@+3Ei-14zwZXyE0YMH#^@OCr@$nR1F5f}^iQo)C zC7)z$r^1veI9{1zPoYrseu?F;QL&m`TLVeekwwp!9hvc4!rok8XWK@h4>z-}5K-sv zz)wDUpGgJv|D8}R4xt3TRYkj%4q zL$gvZPf{$dO~J8w#VtpZ>t;rrWdBq(TzW;pp*{)|qv&Cg5zj1en!eD|#;EIWBZif5 zk_dYw=CU=FVPNup4Gj{#asg{E8^hv5{P!Jz=L6VpV*M+s@r*Olq2RBcM-4B(u@6XL z*+?1yl7m(_L?&I3z6tc(;!fBT=efpx@n?9BL0-;5@#nk)LPiJv=s0fk(86A@$agjK zT1IO4*UbY|@!D)?FIM0P!Cw`>-e}MJ$A==K6i8m}i_0>o@&e&5pas7Z6=V8W4}rH= zAn^Z!g3z3xC%SB^!PA3b5hBVr%J)N`3t9vNj#waDDC65bM~})({7wSE!g8m2n|qp( zyT9o~s-8N3%WFhFfk(a;T>08HbU zD5K%bXEBYsA|nXrf|#G-!R}-KWFa}yT!h;*von8EK`=t!4 z!s^Gs_dtq}T3CaSoI@M~=zvHt+EN18c{Lyu6~Ci+_$vJmo&gAm%2?6?tQk*;umz~) zYM}SX3HCYo#MhwpSQy?F0!spdx#YqI^h;0(Ki~y1N(t>?Ay_Rg%>r$2VJMYQ=2Cx> zm_A6vm8I>EU{uV7hz$(zZW4J5gO8`QQvTUML8<(wdL39bJ_A#MpWS=F({q~?G(}rR 
z$bR*H`AKD$iA5rPoFzH#k$aAIfCGkHYzx+Msj!|jdfas;ddOb$VboCc^anRowmX4B zdGpBe=l6%@pmu=jss}c%5Ua?E+^Y=V0ivHA`8DdurObh#k})}-w0e;Fk`?YabHfJz zT2EjaTO50O630?4TFP~G*f~dR5Ci7Qd7!#sNP0Bmrk8}o!qyW%759EGdc`w#y%!vB zn<@mBM8(3pl8%5X@Dki?B*5U^buN@X{s|~_ssk%^Qm0JDRN+Z5#EY$d>Ha+e$$sPM zq>qUbk3)*VWl`%S)Bhm}8|;95C=74xZKZ2sY^7)^ZPA}$6;u!3M_!i6NP0S8DjXi7 zJsNudS4rB4MCa@KL)SJC_Z5?Nb-QnbC$&c1YiC|(`3~6*j1_3FsLd2d9j;9`zTXi= zGdoCroWFEDtSI)7@y7J_DtGgV|J*Zs)LQ;^&p@|^Z#}E8L)}|mUrf~AnI^I!?!UbL z3px;!{PvAK^A1Y89R!zA(p4Ru8^h;7&uWri1J9XGij)e?@y78zFfTlu&0WJXTw%%b z?YQ9}=^YTa!Lye&zj!J<@st|j(V$}x_Ri(9>y@vkA$2WwXAPdGNG}bwM^e$zYX~^C z>00qN1-v7G%T4EJ#{sNxi`LIeR&z&)z?gkDZs_mZ2{>jV#N zs}`L_hL`@cN6-Z!r(~kX<$KAroD^dxrqe zwq}E&9E`+7lwrq$ySUC~o?iu0DXz#4;0Vm2@dvp}sl#9lvK$n?fsvqoi>WMcSd@l* z8RB(`a$E*&>U3K01t%r310~G|LIG7>le_q%Kn0n6)#t|0z(F4zA~P<3S0(4V?~k;Z zxs+;7f7Tv)rYG@dU)^AoTz(XPJuw`ihzb08ygjF1ou7H3dfp)4&_m(@R~7Ycvm0>Y za%vj<7zzwds6^o!P{&tGGevFW$_`rpJK68A515x3+uJS=JX!4^#Y;*mi`*3OUnXm& zufU+zV>BC-yyZ{#N4Yx%>}G}i*p#0IAQr)iJF364+Tp^jqlzhv$tn^{hjVUqsyAq0 z9(lPsYXUOru&eJ*fyI-~@eUcPr*j{w)+2Nh@7>@_*AEx=YfkFp6am6SvAeow#UL#KM|b(<&2~f&@GLou4s^LyF_**< z;?r`9Gc3-$ipuzOR^h?9O(EHi6GtFAE_q4@;Ox94vzS(Kyzw2|u6!$QO3dGDrWxsZ zBK)YIEC@aZF2+y0qz}}pzRF7PVbNTnb_*B{ng4Q?P)(v4QF;;~z~TF~RrXU03nhpD z+ZKe=zl-1HdnYIvp6(J~&l57-qu%%`(GIGhiBxRexSi?QpFl;`&ET)}xbAv*pwluG zaB(GLChi_;Og}#n&x-u)J)j}Pi-ER8j}ZU(878#AW!VSeGoV_YCQwF0PBlfzfS4aB9BgQ9TIWbl3Oh-8LX+6pQh9DddO3eb zWpzhVCBEx%A&T0t)9f~53QUUTsxeQ#0gn0Yng!tsk(0egNkK(&3j_L6S;bgn@%ig1 ziVuDz*RI2M1034g{8qukmWe?jW8^09_yt;e)4KcPDMDn5&>FJ(){|>KVO;8}joJ)P z&b_Qr@M&dHZ^S-1Ci1{@CBKF$Fg~RtY^^iQOJ;e0CD4@P2{QJBdK`Bb7 z=4>sE@jHOg+2bk(-g(v94O?8U^ogGi<-`=SiJX~*ttp+Ul~A~~uLp1Mmp^-quv!)P zU@o11fLp!LD$w;=VHB--C}8ty*G(_z6L%Q~Sagl9$jzKv3kF){c;TEJnbRK52Ccoj z{?I1B=?S{=abt^(Lr)=?UVBnMh$#5s#a$ikc8biR7$5!U{s!Nz|2mgwRktZV*;J!% zSpYevx5u`DNRr||Eu#LY)ZK9)t#ntsV+W!R3dVi*!KnPMeE2dIgWPpzKz>DOPkq7J zSiF3h0UNT0jOBW$PtF43G9y|(sevqyw=7s2YTRD6B_suVg z2$%GcrW%-L0y{CY;$q@{z%MH5horK3-hd%6DdOIZcLEIvW5Va#$QV?<$c9?#U@hVA 
zUseoq59pn!Q2Gu8QRSRCQbRxAb(b`O6qKJmcw`#f$Fh6b zkBz4E-zOS2Kc-YuNccs&__Tsn(hlm_ER)P7WEieY~ympU1glbVs*_##<+!>q( zQ{*lznmc1NzU@brK{C9(I^62YymUuR+!gRp_aqM7@*7Ts^|deCS23-0ijf(Z66L>B zUd|os&+XWtW3QTBcVmA1ZQuwv=74?4^ptf-Mn@4P^fgNiKj&CYn9R4+a%l@KY!R&O z%;ql_wARFhx;Z#EoK^Xj?zehF@+cGQJnIRDE4{FH9EL;)eUV4R zydNU$Z2+afo%P~LAK5hlB8TX+Cj5H>@(qA&Q00$cDp9#vaV>5C-u!cKPWR6UNdH&B zjnx#q0Y-r0IL}nK0bPOkF-2>+mi`x63d)MyE}?yTHoeRk_Oc|7Vcf&oJ%SuD&emoY zt&^G;-$7JfHnrRGje6Zc9TKyN&-7!4abynbPxir^1qhyiU>~-o15RlV`;|c0ejA{y ztJk4%ZCB-nV&mR2a-x7iDU0*_#Oh4lo9xi~q)=0iun9AWHo46xA;3!+o?_po}0v*wBsM13hM_f8-@;}J}Mee`Zdz2 zXLzZJD8%W&6A?zs@`L9=NEYvU(iOSE^SzZX30Y@U^HWJ}Beewt4dYt$=9Bx$b`OnO zKpIv^R2mC=;UV?&#DQZ1)lecPa0m_|o$;$z?(v@#$Flw1(Mo*#F63&l!0I?TYcZOu z`VxR0NWk6wRvv`-dD1e5b&_Iji|~77B7KWnVzQlZoMZ4^lqs0ZBOc%qM20E+I>$gn z?;~}-0K_>5M_8b$Hu<=f(fc<$Cv@Zy_$urVCCwZ-nwu-+6V^GpppYQb)E0v*a2rA8 z*ZTm+(H6YIYKEXU80kj!$A59q%*j&t;!9@WD`4DCP|!b7vITJokbiIqtd}B_>Vmfz zOXP(`M1`f0c0gqC5|G!mSn9GLdx#5yt!QjK6A+#%bz1^Qbc6VeZc72ex9-}?J`8AY zcX&Q*ShKnt#(%x)k)soICRu$FN&BeZs3YKyg=9t z^tPDBhs#>$JwpD8=&Ky-yT&y*JJHQq+(Wj422XAg}#oR2YLPa<9>^R`<>OZpRD*$#g}(l>Bqip zwy>(hZqD&Q$z8p5k1Jh36@1YR999L^mCYT(w?r<*8vX`00`|DNR>{7kf{+Uo;Vq8I z_b{&AfWRC1@ntsCd{%@;LfL;MHCFu^@tVYVawrdGa$CusUOVpb`bn z_9_*%O6h(%&R4RDqDvlzE0N#B>Vn?!roow~^gq?1HkrL}ZkCI6^}yNR=eWqg4klP0 z_UsX|DazN~?DyeB-LqBI>E7c!4adf?Vm_ZRqmS%=Qt&;V$ebyi3eg2K5BOnYq##X!iDs0NWwmC_b z<$6_S7g~=$kL=H)nl#Bqg3v(KiZnA0dvL<`O(V=LM2U+m;2cG9tgdRh(0Lk3*pchj zJ;vQR4L@&k`-H#YvTsSNPk!{Z69w}|1)~9LF@`97`TGbycT?)78M%66_0=dgg*3qe z6|p?(tiCgAoghG?a&71*Dc~HC7P1;JEX&zFqscwWW_x)ye zw}}fZ--y3J2F}B6%fd$*9XDXZh6X%foPGOJ21M`xmR;Yx0s7*^d+&;@6F(g+8Wz!; zZmr!FbL@FG;V*HN3F8{uk@;x&>n=D&0U6SCv7~au<4-C?h5=~?FftV2{>tnzxiYMH zro1_NBDuMJvb*G7qT)!RAG9R>6Cg~PUkGQRSrsFh@>TVgkP>2JE*w-eOsHou-I~bM zZ9&vAW$-25(36_<6=DXS#@uROWDLGf2}i>pZIP&hhUyQ{h(-?oU|O=7^nD_9RLk6h zxVV9(IJ%XMAI4jcJM?XjqD#YXv&7Sc6$wlC?m}dAzb=JwtAAsPXsVp!9roG1==CHY zV918$sv$XGO0Eu?>3tB*UD)zB~?wV>B@8*PlW2gd%}pePZLv+yG@bf 
zS>7^})bbSYv(@gOduMATNrwL#-~US_wDAiJ`!}RIlzx58z6z^Jqj&PH^3>@ zN8Ca-y=dJo1ExO--xa)gD&;sx5M-;SF!yDzv=wAu2KXDRK8~w=GhVf6?~7ZiMCDiSr9iq`2s-oco8Lnc%!AbJWk5G z)1Z!xS%I=ZbZ4vXk}PP5Z~}h{RnFO1l&W%rhA8U96dgmf?XKq}^eQTkKj03B!JG0E z;0&2lI~#57wuqGI7XP$p!((`>a7~BGMW2t6oJuPh0#sX`LL+l!1>rKX>D_$ zuGeAeNSi3)j8DDXD;5JMT?*=ITZ8ITmT7%(j*X{<9U&Cc*6WaS7da1YS*1#WGN8vY zqCRH4yDECc>Ud28b@!{8{ma6=Y5BxL%hLw)rw>It$>-KZk_spRz?B1IZja)^lyzB( z{;IAkbpt`FBaojvM&xkhd4@D6TYqA#UZNDwqlH1m2r%{PsoFj;DDZus2e zOM3m5bO^jWxJdZU!x*Kgor*63*QmPL`AZ*YGO!(d-N9-%-*W-gJHP7fBy^ru|5V1T zX!m_B63~b@_EXq zVAk_G>ION-Dikcv?eo|=n0PaHRnrM&gj!xgY?zDIZMuzab*dK5QeK<3oD2~+d4~iv z$=;o^R3OJd$j{$$OYdjlT6G}JQ*P9b^hZf_fW}P!=7W=({(UD4hD_QH2onhh_OC`W zygJ!hU;|W_Owu=6&M(1~r^3~w(fw^oSHbQym{mg1D?Hib$k(FcL*ZvBzQbXy`*v}J^P;;-&m7g?PMw!)QuddiP~A7*Sl zOSU46{0yZdP_?<5>8X@&Zh7#mB-}v69l+BT(~GYLP0UC!b3)Gk z53%LasUsu6Gqp9ZwNr`VXUQFZMlg;|RbTv&BZeZ@#RfJPMMtrM@9|AB!L2Z%ca3#A zl)z(Guc4f4g9Y5MJbK7+(ou#O1B?L*H%%{v)BH2^>^vx(x;#)$a?@F~O^FEwKJ> zpb!Lo{c#8*sVt#A9X5*C=kT zOK9Cj!>M8^Ldw8oyeMtyQUG9Oi*GU}-lKu&E~ndYwCW}pXUVSb%Y}f0v{5Y*46=GQ zP0s7O|5%XSGe6eey?7PNrM}>@0^Q*~ozUVIovqtT(OWI`ECT%9EtFP0v)t*oY3_dP z@euU|yMszOjylFSKC>l2g5#0@YfX`^5V^k-EX2)8PB7yJ=7=U15wG=Yz>KT$$5;H1 zKvK*EFSik_i{^o%7)B$RpF0YW?-*o@$i`{hqQ2*3Sg@^`I&}&A|t@PF9vrr@$x(Xpu%eDyc^!dakLcx*Ggtn zYDseq6b`ucl9aui~%Beaj0}DHX>y+AfDyb+@q5`N>b0OAmFy1}AINMCEX9O`X`bv>| zmSLZM!4J6a=c$#YU4Ll2=zAoW;+EM-)W5`G(j}&D96wcQr_Z33_s8k!Yx@>w3)h&U z3eu1wLFda9NtZdtCVNxcLJYMWadrCO>sOq->wyceuX;LoHn4~HuLveS0o)r2jL=n* zrOXskwW2RJ5+hL8p@ZkSF{1_Kb%Mf8VhUr$Bc zV?NL#SBShD#L}Bx2F5>|FB$qxRj6QD>W~Fg%zbUfnB6|B%sGNfja$^8m)d>#;+OHn zKAU@QWFY3*oWh9oQz2%G7;v4K9RK$8CJI<3M&AWE6LVit!X;d}Pn`YYg;|2=F=+C{ zbxMwC9r2)Oe0*GKGo2}lUcX1*6A`z^-;%`75vJ;6?FQya6=m>3-bFva?2|%n1D$7o zE5y54<^NCto=t?V;zE-NE}E{CFxK-n+O*DuU)S2>%P!Y&|J;*b{x+7&H3!}T{2iC8 zVU{r~VOG*l{ux&gX9u?OQo1Xq!VaAgfxGUndKIT!Z>8s`K%2cZA$eCO1kuwKtEwRVF4Gc8w-nRHwnNdU^_s=}F zZ&$chnc)XVT`)1dxG&lsLZ%NdX3tR@0UbZxyEV5ir-z?s5cUmEDAZ4nx9w&m6^HP* 
zT%J`wYfKMzCw2i)=KJy7OOt)^9V0l^E2nU_UHHcVIpNyq{ztCu{(2w~B2M!a%6t+` z!lR(A1OJU*H^tDobxnk-);DASf*N?r7EB#`Z5YhLD=KWwY-?$>w!>{n`!9F0swt!| z>N@=8JMJ`pw z1F6akzfsa+WJ=GzE$b$zhb^UbyRPP(Xe(dDLV3GC_59?XcT_I_OB(4}kp#?W3 z6P`+C?w*0JbF~m6PK=!D*2TPAiz4?4LO_&jwL+m$_utJ~u8Tut(#!Ta4}!KX!%btj z(=4#^8!de0|9u#EH~d0{OUvgBdkwb22wfo&eHf6p-QjY%G_Ej)_sLm{8tW3D&7xZB)stPThdq1sEh(vkHg&R)z4C|S9`lcB5oI4 z*hso({3p|6n|@%!f26*0RpZrP$s|vufL_5;fQ$66zqtGM+C6(PzRG)h2`3k{SsRUY zCll{hK~!Fk{wj#?_;>%bzx??P8r6WbCZ}Gl=3p z`8zWb1^|FYwg!^SD3gDF_upP=>b>|JU$|fDZ@!aw59CZ@x*o&%|Nq9~2qq0SNf zKiy^i^CN&;Gsq`yW3USOF|8Hxvu&?4SIdJ!o3=AN(D0%D2FIFdMpq zP;UlwYVGu6?{`4vL`kGDUOy+Kpexs;9q23xZnyiR`BKr4t#GMm7JM_6{e^PZa zoB^~{!K*~<21x6wjQp@L@hswGJ(rh#81_m<%(Hj_We{|r{ufIUb!Da}z#wQ~mx)JD zS7PcwvU^YBc>Y_G7>#u@4~U{jxY9lklsKYWPw_vUB2wYi_dQlDtwnf9w6czpXyxI^ z6n+Oh!x+I(MX~5q{k^XK%6w7x)i~$8<}6BnZB>7@HT?-y0X=upYR6*hu2EY{|xZFRiMy-wVaR_k{xLSu9}8Z--owWxOb2JolQPh>SFQ( zj<*|fvaM$Th=sx< zG+$PJe-*4Zd9PNS=#Ot9Uc_iW<21fuQ-5I*5E)bfclz14`;Nj_KOpKZJd@=xMKzr5 z$+4NPAK5-*F)A^st%rAsbJ9YdN4Q_~a|xV2D|b6^%SBQXTS#I3(4YFE!t{?^m%r>9 z;^R>A$haCC{y+Rp25hklLDYkMI&kJN@p2ao{tBufK{*3OOYjbf0_1>^mrMxW;iZpk zGVc9>4D5+UN6WU7-?RB_WFX5A;8s%r@x`FdQ$3rBPR#op?uj4K3Kht=tfB2saA*sNJT)v4pSET1ibcmyvt&juFFhRAO zFeTD4v|_OKxsZTmyiRaXbfbe@1b-AU;F|J1Ic@@dT|a|b(DCUe^VHZs9Z7i8AcNKW z+dcGAdbJg4w+$~TCf|eTE!ji|s$l-{TqDpFQ1YCZ+mFn{4W6PTk&1h-MKW{6mihmr z?Redk2Y(|E5cGzWBCoR(l9YXaG{^1~RUM9Lj{Z&~T4$G*n7lw<`&O+@S~VH|rQmvmW% zC0e_V*TO?7rI1Zg_{yP25X-0>$@vlOo%BFI0i2yPIMR3`J5JGR;c3U(T#tsT_7rLlsF|OIFxhM5pE&mG zbq-*HTUU2wCNmSh{%+{bq-pozh69QcObXBO|`{B+0}EXErR1;-@vt;1b~ z(}wg~y;cbsm&Tm9glRoF8ZU7N23q27f1*41T|M)-@RwPytEz8pXArN?$*Eq`=GxFa zS@rrm&9dWUs(CQ%&WDHF993_Yzm%tSE^n;a3YU~J=p@zax68hfaHvpkYsr85I-r(% z*%1;E+3-OUa~{yiOn@#!+QpRmhUMUuBAWmM;~i6LbY|Am_mwwATfd)THU+q{Dd!%H z3C)ZPtf?E^3eIhdPgfBy5LtCnNS zBV4D1mYKU`3~!hO?qjcwm)9EJcrY0~P7B z(t2HJpELrU^8{pJiQ|EsR=EOSHEF(BLH-lg=Mu8VPMY6YRZ8cfKh@bKbq{f+j_M91| 
z^l?873%b`Q@6|3kgkuhlZ3;lPM3a@4MNk-OoA~{HeE+CIYbnT;!5>!e7Ds4f3r06dCz{Fe4SfLVYBhVQSvaYhlWk~O^fYZ$krJ{)-DeR7(upu$7g-I zry}voR9Sbi3Zacx8SjP1-@!zaddP~2p1f3+&;51VNk^zZp02G_)eWv#Q+NDpEc^8; z66ahD=MU>`+$gTsbZcC#dc9m)Vpb7dDP6+6d+#B&U4Q%%r=2W-(1!z<`4j&xFG8Mb zWu>Mhv)n>N80=WJCRzjnzy}+ENG*P|GTVIt-khe!&f^32l~Z|h^+!MZbj!855od!? zE^xtma?^sch3O*MR@y{h9W;QK0;N&@0x>Ac#MNux};8$-6I%QfZlD_uQ zEG25@5GlG{#Ls3)?|6~s3snT$sFfkVV&qX{R?}tEN7_-+_itfClHeFX_qT2)i{_zJ zeBGtayvcAfi1kFzZ0sb6(H6|q!0!tt*K|0n_m}OR0u1Qm6|n=B z`Q%{NGwPT6ZyUSkvaaV6ZDa9=T|%9xbU$t@6z$m4a9H$Ppi!#4&_;7tM^z)I8Ly={LRa zi)w3L8e+7XaxZbscU!K(@6*1)9L|fR z*SC8BPT>Ti#yG2W&YCuVPH%*334S%UocD#hCIYGuN{YD_k4ZHzj=0{yIh4NR;uc&- zaHoPMCajQqw!xTFi<0&h%Oech(uwYZn2i^%nz6b)Is^fy*b%~VL_rfgYn3*AzQeJ2 zBm7+HKwg1%((<}WX4*_oi9SY^yEhF74-X1YrEIp@ZtVD@zSj`MHq*}2YYZhte0yy% z>Ba7Y^f84Dhamhj4*%=M2JnU=y;(7XfbruXKVb8CgqiQ%$zB-fo>lk+v;aG*V2!O9@o#_IpqFz&x*w zQByZai35_V6^=t;aB{9hHtT}pO+3EIFYpBC%dFhFq;!O~=hdSqis{>_ReapJuHwieC~ty$?>Fc~l4NTsH#_IRrIe(V`IANF_a|u+I3#UbOt2oG5liv&i7f~aMF%PaV5JOUzr`8-M!hp0rrC9q zuTpuk%v%sVk;>&3CZTJ0FzrHg`}rU4`r{Yb0=3qC!gD4KjQR!!E@$M`H0zb*Z+@<= zbj#7Zbj`x;3&#Xs<%ri}Da&5>=nWlwbUUB}9i)579+7N$89xT-glww`r9fva|I};8 zYN|Hbu4ID31mJ+4MY;&1)k;n`%`3jqLcpRx)#_{CTNmXOd&#r$M1Nn4g>6l!H=4TY zRYbAf$R&C80t-_q#O5Ava~e{7C^$mv4V-F$%F>+fJA^SFcW2xbn9h4pF=043K&L2^yHMa(w!bp+Wzzx@ph3v1DL_Dx#mFlM!{t=OFI8I|lvF;@=3 z)*fv|7GTZF^p4;2yVh7`KWjz5HoHMC3iWH-0)*O7tQfgCo#Nc>j$-o$rcMjaQ}d-+ z_$5l!=hIN>y`Esup3s zW>7JdV^ZL3=pnC`-Toqh37CH#J#j%YT2LqE5HmViHDBb=d=Qg_@Bw{;iYXAyFqeli z6}QB4c#{vgTJ~4hG_ug&ysX^X4I9~7U!37>u-ybjYYrdLPw&2ICa~Q1M4(|R8_J?qc}`!Wl5YzQ|pCZc-k+e%B!yxm*dUGgph|xJZU2ZV+0u$_?C@` zC5~cRI#L#cS>AGpqI?Do)Q;>G9wPG3%U#5Tf^Ku%EG{bkTkC~9QA`hY@nd2V_N|7! 
z-Xz!ShaSyi{S?i!Rmy7X)^QA?k*Dp^#EtueZZU#Db5*zd*fNLdZxc|FXzcC~L8_Hd zpHyx2P=Z708)u29Kw>I24>237&(K3T^8)L-*^H_gf{X&3pR(iDH3g*!`){n1lPam< zwReyD@_b2DsLo9y2*hzt|9av0?p2c4yHPS@1YsCb)qk&$3WRzLby+=c7v8umUYFa7 z*LADc1SnXwg~gAaBn1b~@8EQiZ;h5z0NKd7a*8Xx2n(02uu7)#nNNkc8$M0dvU$(4 zj#0V2RH^XJZP16!#<*n8Ta=j8RQsr$Z~_0q(bj%AB%V*1P=gw+<3g!d3kO6Fdvy(5 z7hoeJ0qFE{bY#2!7P>;+u%8l(s6DmF*0p7Xs5PK&B-O>5J0_crFtJvcL}(pd+YTrB zPS#ky+p5Hul;ckk+4{uBKnK!KcdF6nhaUr1=)!OTlzNmNL;e>p+5sry`O3CiW#>h$ zo>2}j`?GuDPZ(}=FO@TH8Fu@9Iw%t_TWVK$yJd%+5VlboiY?qO3OJxyQAYkyUZaF4Cds0H0VFs2?^Kv3M#&v`l8qW4#)v z2A%6|y2YgIlVq<##MoquXwZd*UQ<~v=o6;Ma?9q~R&RuHBR3P^DH@fhE?RWFSO zinXs|un*_Ae1N4;BgZT2Uw6($+RjOb)5@;;7MlWz{?~1K!u6o`LJfL8gXNW+&cIum zr6N!E_8#Ui|Ir)v0ePxGAG% zkSi=$w&>y!%zca6%B^YB8`;~)&`}+Usif13uJ0nnT6i)?YHKNx$~Gg_Vub1@%4NCM z%p@mACE)l$xVN5-bNXiRcB=4Iah4FhnjleuqFgEg znNT#g*TDh=-(;nFgmTsHCc_LJdykmwX;;cjGw9VVHNcA=CVXl4C}l2seDCBR!-1dC zfh=0Bl|eWU0PwkIxIH6Qmn|hd9jn*Bg%Iv7DcW|N=mE!DnDvy++Ttj8K4^KBqRIHC z3i$5?=QpJV-0NO*E)ZJ$0RxKP@u~1@$scnnIoaU8G=&ydqO!;kPO-PdsO(9;wvvx1 zaI;@-)qWum5D&C)wn7cnJwdsIwoLS_N-f2H%4LLda)n{i9Uk|`+&T6RMXFtOc{i6DHcXs-G+J)Yaq%z=9u%5&r2>(wPP?pZ*;DbI%>_T zvh)yG$lMC@;~1RWcbdkU8MK7rL$yIRdJqQ1(|gJmC}nIP;?kvgJay zvSbrGz=sc6wY=T#JK&b&q#s-C+; z3h-vB*LLumF$zs_8xvV@?Q)lx-wu1%fm{q$swMh0&mh&S?%CV6cf!8SD-t|DyGTqOIHtcdS7dgp8`3o8ffj8#c#hmxgrsk)K5pw$p0y(F#r~frs z;D4z0{Xi*_RqIq}?NUXZL}<=yAbP}92DDWK3T~e61P$UorN}P4;c{y%Y%N8@KR$ED zcjUxBP2u6t=9(5&D?b-gD1I?!iMS6T#HgO2BN7zjC`Xp|)6 zJord-HEd8fzpCa#mOYElU5q{U6uvD`$dAB{lpkMjWl3 z27C)nlSd*miMDcZ!%yoDrD=4w!gm$E&k!(NBU2jVkjr#y&r z-gW7EKDQtNa&6D@_$bK!4;1qXJmFl3Fl`yMm$YB%&a83z8E18Q+%V%Ch1x?gE0RRo zk~lL}*S!o#CFe%&f%{;hl>21>&>_3}dZvbXjBKh%xHWpL7}=w1oB&P-&x#1GB>9?m zM|~N+R&QVZbcl#Dn}MZm?l7V6c^l#=+U8ZG-!%p(IEGR8%Er)m?+Oyc62>_t>Y(iK zWbMdI;&fFu71?5P88SO!3PR&aaBYaV_!{!(y+27iYmD-x!$iRClK zh+_NY$!2h*LXS)6#JCbAy~A^N#$3p)o7udrp6Du!YnjvNib=ltabAhdiiA`7X}JV3 zJ;d{V3s0z+Fx#(pzBC;Vj@xYZUTZ|g#~|ioQvB<3#oOa-)bIX*nIek4phIY|Wj%hX zgznae^QyM|%h)c-J#sn`+t1d3xi>diz_@K%?Ce#wx&)(czOw|r3vv;mFh*Lfg98(m 
znYdBTIYvEuG~*-7a^+<6mArz}q-Q;7gwJ>HP?XS4-`GBAx@%;u6D1! znG_&<@z4oF7)!I;uze=|Ks6nrJKg3is^u=$E^pD+o;eLW`4gErWrj0r`+lUroRyQ5 znjWwjz{#?bwiA6-i&R;+KEe>bjIyc1Y|Kxj7R$mBn%^y)<8!ARGT=-0wo|!WZWjHU zt%grjFmoz1x+N~S9ipDc>r(7XUdS*AK28sst2#g5x9OlRW1(T++V1`9xN*_#6)w91 zF?gYL;4?L)_?qcN=`X*eU#NDP8nBa~=n?eMhvPbWCG6iISMXz?&5_v=omhc_{rviV zyQ#y=aSr)El)6CG(D6p@&51He<&*!m$2)QKPgwb!L6pwBI!!pZfvl(Jd{nPF1#fX6 z`bY5ng4uDIJlyr}YaX}z6E%yyN}jX&(JSr-{|^2R*L)PoOt=GTvNk{*!@d;)Pf+pg z9swlOt}kuBoHD!USR~aR8Z?+3KJIvLH}$suZpfx%B=G5iu8aq>!Z~Q(aWF~|+rLLH zuY~pSJH!AGX|#d6P$!fZqIEtZ6bQACBp9WBV*dAE-!B`k_7ubw71!>m%QT0Z{QHOg zIhrl|nNOVWoPB!v%&-5QJVdV~kJ#O$`pYZR;;1}fHCF@C{{{;sJaG4+Dq2yjD zKGEP$p8S_t{y9DRY=IqeicCTm{`peLJdCUO0ZbbRhlDo(=sI+eUf4Znzu3*fkDd#I zU9|RYHsFlOGZ1SD`~_wB?OpUw2S%~z$DOR?+528y-u6%~sC`I2ipFxMaY0QyGf>8E?& z+P8=zZE+9mL1< zH9U&$k9G<0ecU&k*-yrBJ6E3E|J4`t=4b;;bwKS%S$hlM2(<(IJ#0cL@Hyew6H&Q6 zTv%qR?^!dA(Bu9l&$J9R89pKPg|1JVh(ia9b5Bt0%%kuVsYq9p)K@Ykmgw(BLRk-i zti?YIfBnvGhEAM2xwiu^XZ!;cQaORzNCtBZ?aE)4 z{}8UF1StCrbV*+QrW2lN*E*rxWj1EYmY}9D52lWlu?ke;`XwA6cP9d%a^fiQ!^&%4 zkfjCt!gn*9#@I;*!c|NpD#wG=<^a;bLb1r>^oMzpP^Oo=x!Nq(>b?OBiR z{W0&W`OpA%eNE$EKi=<%@No35-CBg4S`jB@j`I(el6@D78b5V|m=ZdBTg<#Oh)uro z(Zr}(p?l8FGo<|~>9{ZKABJE(fQ8J<{5EUZWwg2vw9OG@qaUyol#qA6zMG`e3kbf+ zo{YjLBN=}m!gq4x3S{7k!cAYD6KzI1RDQSdVBMY~&OhCVw8c@w#;=w=YOF|qCE0I( z=}b!1um%gKWlPcB#(k3JUXy?k!5$DKsB{PDw;M*@H@?h$j+e|ihOJ2b_*8XkqVOh& zcGb?ee|Dj-)RVQ<*8f;Tc<#}iuDMrO&Q>nW;n$<#=kO#;&=CaH=2Al>q%#w}%^rRL z#D~u~{?(^zK#k4i#6}Jo2kPK`5X@}!(pvPGbNd|Y8mKKQ&?4w34Zgt8&JaDD1T+PO zmIW|$8)*4?+Y^cAA85^^?M#5y#M3%tK#b4lIRU>Ut5PB$0yrH!ao(yw--0{8f-UXJ zD@GQhEAhRCPpFUUEY?8J6>M_`)hH3W+JDQ%7_AUdN=gRi4~d{flj>~j=jn5sihi4` zR&Rs${LJlm&$}icS8z_azAJG42fz(4#COT#2MSm#MDLu!klx6le6cgh7lL07yURw_ zcrTP!D*zS5SL7@(8knC3zQ0!>k`T&5iFbGyM#L~<@7nN{#sQKtuz2f9(_l^W13hZ3 zXto-~p~`71v&V*EUhAMV7B{vvt#E}L(jKo+6F(i-VHlv~mW0ee^N!GM;#5cPPR2sP z0uCCLWA8}|g~rY3Ta)`b?dMI^$&MCvygFM5}x#*194=O3?6GQ7n31%0)% zc-EaBcG*%5`(3snvn4GWDKD}=*{ng%I*$Q8Yoh;RIQhx77uq0rbt{D7N7SN0AEKze 
z2M7H)4rPuf&~AdQ)lAj05RZUO7%Oz64GZm?#;B~c4N7Yi@c5*Gtm!z5?~EYYv!^Wc zVeEOmsWQMaydrs;!*zP!`W%;0as$XUIk?Bu9pvNm2nCd|HRbOwWpu8iwbuJlV#t+P z61;HHja)BF)~gA?=~<}CyBY@G>AT99TXaEx@rM`nAh&$m2Y@Ho0B6s|vxco}_E;~? zeYp7R@t)yH=elNsi6l-FVq{W)?{Ha53MHK`fMYWI;T?r_Exa+`=0R7$-nNs!+TNSL zfm^93#gnV<;K#^@^-WrgVG_{`Wi$RRT_uwl@y^n~2#ImqK0v$%5Np~nASVfHuy#VJ zSFW3D)RGTy^v`UY21U3D;x~}B|1gKGCx~Y*kq4SATUQKYYkB zYbRtz0-i@n7x)9=Lf4a@!i9?MfgaminmiL}0z1A>NaUCeSQ{5u6_pAJRU(UL7X3j% zKx+O%!&=ncV9?A92-MPJ!IgY0A>DFVG`VHpo!Kco@X@(ZoCB-I4{l}>oWfpNIIGf_ zY&CHhX}Vfc@};vM1vbBpC26652xsdG+tp_nUQ-l^>2>c*f#2#>r7RonKFZjX zhGw8CROBLR7^AR}fSEyrx@<2m%utY_Nj=n^y*%NM{&K?9xB{by7c}=XDpdlt(Zx`3 zS9aPpaS70iWDC9f(7carj=mdsax^s9SHV6BzDuX}B7*&2B}ObPS(@#DrXRx#Kz=o1 z1i9Fj1W}HuTdc;nN&5c?u`+|~uR`w^2axK`ZE(}Y{|}4dE$m|^FEC|zDxsf}zBwZMf5z zH+0c(H@|%Lz^Mv`hk)|C7tNI|!Tvcyb~Uc${p|;iaWb~o4n<_a_5Fl*H4C<5({Hnn zZr!hfXIyu^fDQ+6aXv$&+#K}g+_q^%IQ)}AfvJ{eL+%tLn&790o|udMzK?f)5Ck6w zSFA9`__euxHumd+q~cUhU@(Xk6Y$?O56~pR<~E{Zp1%}j*H8YK1+-Pk=}=ErZ_7%B zyR<-xT1T!VyDJEPFqbjzoCl;w4#zvF+lNt#?Q7~u z$E$sZn+#d!SuSV>9`2glCs4WDdiYwylP~l__M?E%wt6_-9qm2j^nPQAdBaH3*n6Fa zUa+cOv$#m|xg25bQHrku_{l847|V{{9^vfo8b3owL#WGIwe#jgp`ZXY=Ff|91?*`7SsiZ?uO zp(wVi$4saFFaykEh{^F4Wax@VZ~z_?8iK31GcAU3>&%!**V41z25IJ9geQ|Mo_X-MpHHGAIqyh{dS#rtePkc7;Q}| zKR&Zv8SqULKq0}kwdg0+N-eo6LyAc=dPRBk8ed)p3iUc6X9aywW@VTcv1eWPRNzWf zb{N|(O5N)Rh+qqE@o`!r5BXL7*$tlkrR9bg=?q5aO zhwR4a2=Z#pYf?s+NgBb2QX6|)A66RKUws6u3S^uWzSj-okwXcATX(y97*) zLqEdGW2#`AN?ulU6u|bf0K_0h|{Yj7Bz#J6&pYN9OZm|0pW&z8=*fiOK z4Cd0oil;AJClGA#bGf?&uybgO}0|u#xOg2TZeD?u^Y?zKzZHq|TeS z59Wu{MXcit@8PqNVhRt91G53aJf||8`{g|aZz8w6T{H9+5}OI}K8&)@Pg9xsQ5Det z9ofEw){uZ=$mjLAbqc&R$qALzn@|4guGb}@SLGQm;M1(FplL?1JRyY<21KT1Pn>;z9-SycRI$?Q*QM3=OuQ zGuH6^8CFxXpFT^!z16}~G3#ZzGKYXvQoclLa-t0{w4 zbmXe<`JwUoCX|=0+Hz4F(47};A>3+Vp5_&PHomxO$`_uRID%4kVpZz~j?F~$i$uw# zs`;!%4=CPrq0rflQbiEcro2>+A{NUIyd@Tu0pei^?~KC}BBWmczcRWw<3Eh!FUuWv z`UsKi`Ewis9G&groy)tmG%s0HNd~MO+>@%>7L8{~1>s6j-evUhGkHA+H5*lc?|r4E 
z`62Cv=9@Pw)o}2^556T;v#B9L$ZXDcuecSsxWx{Wi)}#$H4&o3!c-1oZh7nC-C9mJ zdR;G+Od67Gdw^X|JJ1&|IIi0h`JjPUCobF(q(Sgvefm-~vey%Q&mj>;v6}H9CazOz zn#bG1^v3Y@ao4qs1^jKB~%u zwedLePTIn`jm!TC*MC3!e#-;x`~qL+!9If|@9WVbAi*qZb>hD`!UW}3IWqbKgya*c zm6pvlbwDVKkz`6g0*)ExxCD-aH{FVjl~aQPUi9UUKoX{qh{#rb@Dk zTrWn(R3dFGyuTi|-CJY#EETXUtw$lGHTgkd?OWaud>_=1v-9qy3f%Wa3r&n_;`=P` zL_w@GL{^)2c~8Z)yNlm|nr$8B>TZWRw9=OWDneAB5T-jE? z4wFeTFV^O)pK3_d7ave{CeK#M61M}4Mtu5bp#$a~uS`EYa9bw=rNwLKPbzMUul zr*W2ZT^<6I!QRqao==@Zm~J+8A8jdHS$ag&jlfFG!G)z!E!t(nnb?}gbe~gcCFk3& z5l?pBd(+^kiOi+!I<45NEty2pIASL^0;7e`1o(>c5g+tpQKpy&KuXt;))ucInINIi zWk)D6hxzq`4eyzMzZ9P?EqklxhT=Ld+$f$;tf1dxDp+RAy9KHYAf=-g#!Vk+1MQm! z2AZz|4D@lIgwj?dZJ?9f$rbL_OQRTwP4~*^=+|r5ozNHfA3e_B{j0B-@h19ISabaX z@BX&7lf7ssr>3zFxGuPSgo8l0!^d))0E2qe6IWPwWuTyG6z0SJAfh+88bJ~Xc*E=LEkn{vMX-wVoDN%>$Y8?)LmWa>z5&ba|ad6V^J!Wlx^T3>J2Il zEPUa#G%tpCBq$2h*+`lmmnrcf;#{+buNv%%#*p-lpgE&ZR$i53Fe2mC1R#;JOU2!y zA>18fG(xav(sPFbbUzI`y&TLsMO^Fn)~}beveAp`mpzm)?d)y5Le<8!1~~_1hM4sU-|&nz#4oJ00J^-4)PM&7qo$ zK;q9W%0ML^LaP1xeaF_0L-g`2QfDL;h#os)J|DF3V3vBxyi@Id+tC#Nqv9F>J4hmS zdCWcKOjO?Sj#!dK3#pTBdH-^bsEk`{1+C&Rf0CdSAcR@nJm#~`h?c?uOjmObST6jq zx+QLOD3JRqjSIKLx-6iKR!HML?-+XGXF1c5s`LODCH$-j!tY<-6|rm|?$5%b(3{Xd z2Jp!%4X0=617Qyw2KS<^x+TL#Ed@j|nwpYRPMC9yUtRY|M@UvY4N1RSN}lq?U#RO< z_1}8X`TcQOZ>NjfTy|!XG@H;h=5k}JxI7PRhVhQKl5>F1&>mJ5D`!wfwn9)DaV#CC zMRy(-LH4hkv#L+YXj6Ja#?qtffX2JLOEsQ-y~uH>*vh*-PHy@%-^Nv`JpPod>>I*w zQnS+Wz(W{!VJVG{Kh3LPh5fkLcf){|V34X5FpN<$m*>NKZ!DB8k9c4dBice%l-64r9x5+BM#-jbvVv@4<6om`K*OWmNaZ zYN9^r91ujWfD@a-Cw3cj&79-SKMHu$2c~*vw&$WG@t*xF93(&k)uB+u%rZ(T8zA*x zsT!+zKe{Ic0sECY`;RLm&hls6Ect>U8BbucH;xZtcdnzZ$;)L;!MhIW&RgFF9;y@H zKvwmnCysWTnxf%l;9hz2W||Z>Bv^y%5DlPFn}Fv^aY&Hb>8ZN_Et-&4_PkU%jy^DD z3#2p?ry?;)CQKX;-ysFw0q03BbV8`IcQ>@X*xcXwMRg5+`Pxk0Q@dSlfV6w0#ByFBLsS;wjRSvY=- zqx+oXw`MxlaBo--k~kaZLDY4=WmNNQUuIvmUp4)Ua$pASIV`T>Y2k?*1@-kAY<`+= z5&G-p=*D)1OEu*i;MTe8M=Tof0@|26PfwJSGW5vR9+{&~saSiEELJzREr0ML_jnHD zo6u`%jn6tXDo9lUS3+a0aiG4ABm%nU3boA{@1HRwiCM9jF$gW7G4rnBw^g6ThgQzn 
zAzpVh5*noc0_fcp*v~e*gwwzB?yY>D$=AZm^Ws+}(|ZnTB;&%XfxETzWsL`g(rjGs z@ufZ2WZ?BFoNcxC0F<=(`to@FB4gOIeZ`fHeN1~;wBJQ5fv(`yLuC%pHF0xX2YTQ0 z0%-Tp>ZjC?XDG`h!|JP5yz_4{DhJD7R(m^H-A(!)iS+@eYJDM+(H{v=K&>ZUA{8j2 z(k*$PrWD!6mT)~3J|&A{YcM3cHDqy<# zO4N#651AJ=Qv>(w+i?;8I1A$3)`phazl40YM531rqi{=q4TRBQ@my(i=@fHm=_Oq4(wHUd3J*N#t=w(&1!{C%&*ZBW{ zq89J>>?l5mJt`>$!ciFjY$tP`@Ug36lBncozNXbva^;8O_h+=?0RDiTa2oKXhzH<% z1#BWN_p-`PT<#~v7H32*SvOw(IQ8_&&IP)?gK`L#uz0cc zv_+TRA(5-h;dHSz`=sOOy+%3C!U3ky`1XCWTc9#M z^612$y#K$w_U}K2?cJ}h6?R+kuL8|qM`7|P_*e=j8Ge6>zkY(>ekJhCZn}uDdpBSI z`eHxncYpGwihjR-i*D2I|Mj(h4-NP=!Ly$^9`zsBz`u-`nlt!V?riS6{{AffcS`;{ zC4X-b|96-Cf8yW@?9ddw2}1;e?iFzbegub!=O(!~Z`b~Kq@&Un!lA zbaDsu{*%*y8%n1ePag2x2Xb((Bf>#vY;{=R{Z!eGT=^c)^!_@|I^4v`U-)d z2hXD!T9$NvG)V|b8N$ScNY)Mlc8#RfdH8A;jTf^~Q%6x=0hTF8uXq4+ZZ7&$R}4X{ z{YjnHj<5;{dm4OwJ}IEeaF}+PzsAn~Fdm+dz(Wj&el)4uG+rUN1SoGQfW9qjhU<^Q z{DITE0&q!V3JxzFqS2<<&DV)L+e91C+w=g!#!>|!2tA-cD8B<=oxQzKw|mdN&%A@Q zHYq21-2ezxrgFD|AJi8b2Wz4Z1U?A9AMc`J(x3zB)pZw8Ld7-S(b#(^ zOKfRQFJL$Um>WIk2F3cJtrgWBk8hi1c}wlKdE}f(hM7NblKyI5>?8u?n1A$}pBeI? 
zw7dArjL`ZeT_6fi1TnKQdT(||*GAlCl~>;ts6y*s>eajm{7njnfOwLI{&a?f-SD34 z{sNdzt-#N;_#XCyOpP}|nt_$q(SsHZm6}e+W6X#Hg5Smd`HX-4-8dw$nqC-4+UvzY z0btexfZP`fE~sw*2)!$64Zt6p#gQr-5E^BGl=2EX?gL=9*Za?-XPbpkFGuy53ean2 z9z)jA$64Y5GesdSdWVs*6I!>~SI`D}rraSXN?X6E`u0K;AU=0>fWp2lr=a#eK$P$M zZ?B-@L00j3@q3AWAghvztk-O3y~^6)T^rJTUzc(HmaQsf$aRCp45ATkfI*dA2k_Ph z94carP||}>Ta5u*8kM@9sYj=j!HRANoqsrz%)I&liG?3UBl|EMJh8v!0oJMFF}9M8 zl;s0>)6>d?RT?~T<>j%%j{e_1A6UtA@oQ~TzaRhtRK_UcU{S-?(o>TzA%Hp% zz|+RcCEL{WH3Rh3`u>l% zDanUe@T)-QCAQ8v`Xk4Z`aTxw2XDTUsOM{5Xm#ODW_og5SfWAO>;oldb1o%MJI7rO zgGG1SL8r}67VK@QP8^liJ)rGh+hb0c9dx5vZ&-c8Hm@afTvP$YC}HR~lnz(L=a#%I z51B6EB`soMj=shI+a1KM*K=SPE92pvFUkY@@Jka7m-zUiFAtZtf_bctaZ>>ZfEu9wo+sk`aAoWE0Fb++5MO)yQNS_ zS7xBB)Z7M)wa6#(L6IjU{IHJpeeu@i5=+fY0c_Q5-bvxS9zsjuw4cHo5 zbP+T~@?wDpb(}#}3Uplc?Ck0Eok`?m9y{v1wEgHwhe2U@JZSW}!H;5Gy&kG6Pu(W& z_@CUZC2Eq-{N!Gbo9b!`>v4MfjdXBiGC(x#h4X)jbk9UemW_kN_4Y8J+ zSv3=e7?~&w3+lgro!AlwNYYVxS-vdFS-6N^-d==(dCRxrJ6~Q{J-fuCap^Vcs1rNs#H;)HzHL`*NX3v^%P%^fSY)-EI} zfEXYwC9iZ@2DPE{fLV8Iy;1cb!fT_$rq!M%&)j(#B0Ff%1#7e z6xwXBx$MZ}PX-Zyb^0+R3_ka6x?p?0;M3j83nxgzmN9S1Lb4H9PzMUC0;8(DkbQ-e zf4dB%)U{R?;`q@D^yStlA6TGgz-DH^{r&<=K3=U0Zk*<>7tRyEFOv!aAS0?u=JD_a zQ1kGu#3kB20{bp=b`DfUBTg)XQp{H~p7X9BKY&)j0)Y*;+g|n`bGl1(U zR3-K2l8m`6_;*sTXZw2v)HKnhm6Dr%4H&><{FIyMzd1Ki8WhfsOiNN<{bYd_AL0!P zgUh*v=Iybs+o6u_$YG!q$3wgB?n&wm4vz<5&v*wHX|2SrE7dBtBOCT&L>BbA@+o)j zsoQ9G`Q!nNc7i8L1;NdKxC40c0)raH;P8lS(77`fh&iR(P%PWsL1z6wH!7_2Gr5+G z4jVti;|*@PY1&qPu#Yd=b><}-SY$v~CuIpPU+F1*v<^os6x@LE;0oLK)Cg9){zWqC z1hjn#E}(p-P56*5`6FPGavOASydFZ-KsY+S6$=&T6`z7UDCY#bM+FC52}>xTzAayD~XSu6rWp zjfpLxdWfPa;rW$72Hva(!ejW9Z-IB^v%C;Auj@PR*U{?prtP}G+s;@jhxc7C?==X? 
z$fxW&(ZRRJa_fs_3}D=pDZYev7+~DwvlZmxkt?Qns-yp3*Zux*Jn^5^kKXmpPVX|8 zRc;n!j9UTG;d`Z{*KsjBq*fcK{GKI=;)RpY!s%i?^wi=o3|zb^Zb?T>V=1AWju% z?KAA0fsz&PT7KA}1sDxaSVV`uCerS4>#whQxS4z8?Yqyg@f2(A-jCX7vu&%}#@Fqu z5m{vrO3WbwUJ=_e z3|sIo+iXKX?zP*BV_SiwgckLqL_(^|m-h;Pp?^eeP~Ex|!=P~9VE5a;YmBdpFTkA( z-~HU!Wq#+O|NEn=tuTU(ZQl|hB5;Sse{9|YTkS}j#?zadAS7#)I|LD%ko&yWe~;89{Z13HJN zTEoe9XiUtxv`JUedmjZh;Ji{}DV`Bh$qSlgTo3C5yE3Y+Muy}p=?r^+E-yuZMgiyD z1U+16p=13AI<8SGb#C_K&I>xT+ucCw?ApxgzqDs%GF-l@z;|%Cc(`O+9Z_A3orf)ftfZu!Iv%Diac&1(Nj3^h(7sy01iaRLL>czB6TL? zD+-6S90cN#e@Kk(;!jB=TO|@xK%Q<2Y%9D1Yn}S3W6Z?plX* z^hM5#sC-1QDLtjz15%?c{{3GdnL)3k#7vJk=bud423t=#p@NN3$AG9B85CYh1<_s` zXdz6<7eFyyaScsT6BRGn(USKM_?q8i3)C^+URiwgaSRHk`3n>?LnMc`ucctU>P|slBs-+A(!5 zBUe}A)fj3(FwqW}Dq64cw9LmO%--hj1P>1^lLdEc&C50E&KC(N^+xnN#N=6SFGo*g zFlOj~>;_5RCu{GEXfe_?(5F~w|5&wtOi%{322VW&TmM;{oMp-O6a$UPJz;V5HY5cel z@DLf!z(uN(j)Iae`H@DB2AWbHE!D%z)>WPD`AkJ(v8WFb3-9*@#_SDA;R~B~N%KQ6 z$E5ev4O+%Et9Ta9JLims1 zIGpjmNB_p&Ds>1v2udB>gu!cGR>y$fW-LevY5IuZhtJ(j^$RqCiK7C*b7X+SgJgw$ zb2V^t8hyzn*TM{PQd;tLfj{U;6^I;6n!Pn;XsNPoqSp;6+4v`Q1~8w+k=Rn-9`OR3 z4fa`5SE#?N?DfJ}I+$$kAOJ+}v$GpXbZRU2SL0U7qYDd-la{=iY!^v|Hf4s?6Q>RlHRq5HBwiyZF~K*7Qo42c#2s~Q25e6O3roU|vFEe8^(R~D+12mdx6nE{GK+1WXve1Q4B^H}#G3 z-y1RTym4lrvPeY}D$+T8v5Tz_8t5a7*%g2biPqTXsG$Yo_{ zo;u&xyR{`_Y4o)A}NlF zul1=VN&fvdgcVH~LAMpCUS^@~eQ#R+#|>spG~zg^Y*HjGKo^Ig-A(PhxE!mP=Y153 zW18s}rtW(=gbj`KG=`&pT?MrvH-&A#uEhNM+~C<(j%v-L$2op{brte|nu$3A5i{{^ z20)>uj*+V`xOo~kUAyyUzZ=Z^46LT)tr*wo+O2uhd7>AtzhA0GjzsX^9aUt#_kpg! 
zXkV=Fg$AXtG;|C>Pid*Z5{ntHtLtLhiTd3GitsdG0;OF+1UjI96Dk-2+wiGapypob z)VM^Kje88!0MLkl<85p39Df*s-)ha2V4$1k0g|(*sA1;)DIqO(OG7(WvX{v^hrmv7 z`n8%fEx*&-+ty!y9jx=LLqvb0{NHI{K6}^gBx9&}kr5jKtif?nPu!G0LKPefEn75# zmI1d%=ivIIxDL7`|G~XKQV%#c6m*hl04RMCumVM(gMF-A!d8jE{h=BEQ5_&kJD&x{ zn>HuJ--O}gc>p(h=;^pEi6O6rWA^|iXR3R}3}~$eXFkVynpYWywXpTM*Sn4=^h`jUHu6nmy~UbMPAKnJOD+{Iz=<)v*E#I~=B4t+uri zVb@6dz4EBsg@V(5Sg1=ee0BKeEu+^00;*{DBVgsLg{4Rt;%wV*H?$lh`96?BSp~yU z!!SS=FUqx)%soACaRX?$jyY6Ax=xCgO%om6-5w)NlI6e)e0@K{dX6hsg|(;GryHTc zF);k%Q>-ym1AxyFLIZifi-%SGG^%3%ws;pnS6O@B;`OKgvn8poY?_4uD#k+d@Yj9 z>q+fa)!K6mo&|{$v8{$0w!l*$^--^0OlL>hACWdv~ zh7CDGjAz_y>joAajs+{o`=39cgQrOo=qq%lMv7vm|9x_ma{BdPl%#Z-#2Uw z+TLV*uWMLnJO0`Qr46(YA?F1*rZN8f>#J9GmbDs`u+ev$+?S%(%FUj^h1(-3TyuDE zD{H}BZC!H|XO|B;`vo=>Sm zYktKAJiX?H4$X||hgy1ElZHy;J2nk*T7*Qebfaa+H^Otc>@u&^j_cY5=in8P3T&jp{sO-_DwWnJ5+)$z6?Vj1)eWvSF58UxCuVGdw(U%48u(wd(J4X_lCp)+gP z#B`?*kGAD|?wA&VZ_7$Y;5^uF{%Azqj5dtF3C2-@d6?s6i3GJ#rf>EH(AI>zO?V&n z(FvTBrB1rTra6)_iA#`Kt9B5?bngyrLzINb0vVR2Ne24ulTY1jqi!OuXU`L` zK%?(+vD2;5XAgbKSielDO!7)h319QTuV{7Mw)cqT$f3b(A7^ODI6ZOMf z3~XGlr~`r9X&Of4PVpfeWpvv$Cq<6XoRZ@81+KrRvBniXQ5uYli^!~kqXBkbo>0bs zw&T#sF-46H9XUzv%y#8|U`z@F!0#rXt)1asIj&voZRxsXp71D*|C(D}-GGvTaX*0} z0ia|-cdS6iPB&OhZztrx2?9z?rAOrq#b9I>mK(7dwm#YW|JA4yK zZ7I$|@~hcRjCf+vBQs2+`78WtBNNF+R-wyIP!)uJ5jU?L?I@l)fAjILn30XLn=Qwr zZG6M*_|MN}8O+^+E(t^gpuPyc%)V6&N$~~P}als zdMI79*yyG6*XQ)D5brRDVe&vRCTyAc9e;{$;_QiakuMafq2Q255%YX!%1alCP!bR= zxr4UKwU+Syl%>1w`lKq(B@|r;moya_4psMTz+9QqV%eC@c_`g=UPBfF;o2V5=0pk- z;t4d?!-nPw)}Jz_3tcLdi==7J|K)b@Mo}7J4$$%Z+@z|k)ftNCdguNKTbom|B`yRr zu9Td-&=A#eK|tos%G#4Eoc!$3TR%ft)OnRCV!mu`@P+bg$t}}5{tgO)J`XT`mu`rO zh+q%O?U64o1HT;u;{9s|EDv-TJuNm|*5@x=Y{47f(E2|1YkcN0{{jKBN?f7ru%QCl z;~1+^L^!-VuKIzIN_Q)Dz~MmroN)S&cw-q>DrMd#iW*m`h*~xM>h62y3YAm(yxjC%jZiiZ!UOW*_dmqTWK*`$z>51YS252VaPM|3JsfnUBH$Cxi9f6 z_va9$D`%P4g~u&rcx%2OsZBm@q=96Nl~9`vhLV}W}njRk-&Z6r-aQ;pRt%P$O@n(vZlr8`7ntrh&^E#>j%T*v(q&jfR&j~j8N zpnpAj*5^?jC-W;JWh7VU!+}(0!W*|CP8bFRx~+?R=RJllHx?f9`<3tBHF7IRLkane 
zK83;;efOnn<1-jdqOPtwchXpr`BGXT`4&ANSstfziAST5 z?ald{mO&^n4}jk`4pHOBRKM9GbAV7A%TuVPP`ZyNZBs9$!VJEtJO-WwB=J@to-9Kl zUvMgvY~(sm;4qS5d)Mf5h7t9Rhr)IW-R(fja&a$!h|i=*OMr2Zobt6D9CyRL6hhs= zUXHDJEJ!FVP;h83l8Qy;&=MMv zc#BspoZHbVu(zA2Kgg&4IUNFGpN0ga-PHT|Yr`QViNemlshaHJ_5QnPgz}rWg(~N1 zx5HPED;9^z>WSHI>y_#7=J}B7Tee9gR^Q>;bA?xzwU~TqyT!@^alMC6Jn5yglP3o zA4nEw4L%3)j2VGQY<5na*Vq#;;P!pd|87C}y!uWBgm#@L%T4a7^Yr8Zcsn#Ki#`D4 z3}afhZCrFM(Fm$ct)8f`P~{xY1Q9~HaI1P#ahWI|-SInpC2)AS4eB4MzO7aoey7fX zF%HrwGSIPkL4ImUWhETzkP^S z>&sj6F2d9_KEJueJlS4?jbU;vz#{u12@rXY$;j{N66ziI zcGl_R^7Jd>Z(=vDx}0>>>F9!i^UsC{W^zicTc+lfY(lR^D|Fl`ouTSi>X?9AxkiQ& zVW|grJLT1EN-nN-U%Vvex&&i$s^<-3TpYi2Pu}Of&RIzF$B+rn7G3 zcq`W=LP{XQZeeV+{Jp*kP79GrBH?3JtR#{pCW;IG*rPbmHD)~c0L9a<9n(1?@12JN zkD*y_OKYkzKo4}$_Onz4Q-#7Wc2=kv- z>c_(UZ@K>a?poX%tUCKWBdcg>@)Q`kB#s=F8gLvuEvofXM5VHVXm-i`{alTdg=|$V z-CSkdZfWh;UDaHb9#759$4m^@{TY0I$?z*=*s5K4G`_p?+R0)*F8V5m;kA}u$2Z*G zjke!jA6ck5+648*AE@6;w_zL8eR`?$|FDhy;gwX7A9(U_Z^S;oZ8{B3w%ft0KKuNe zf7>$(htSD$*=MQ{ff3!uhD(3?`Mj;XLCnwPMmhX%|Lc#xmG==2^6~L`Pe+s8GYSUr zaB41-4_r^*KMP=e;vmAU<#W;DI3?3*t*>WR-)fdWcs_9^D)HIqyGF@FQRRmO=FYLI z{!;#dZuU``Rl%7C1A}rT+@V3_?S3Dd?wIo62}vkeSgkv0{xi$*=X2*bqR_kNr;I|X za7Et`g}C47QK`#TKmDi0`OklL2H8=k1&00~)+-ZsdPMVkk-xuS z_eJ4W5M4D2_{VDe@dEhIf0hb9u z@bI5+=>PnHe(cJK%T^x_F=*R$iLsjvygf4c`t@r?;&5sn3uYv~7Y|}@T?3+UDCLKXgN%q_|@&bCpkIHn9-^-U@lZyVO~K- zx@3fd!r=~C>l)8IySy2x2!}k@-U{uaR%XnrFv@PaKB*2rjRox5D6#wM5t2*pRrYs# zDh1ASMy?H#C0$%uXqC;M8V^q{Tp^k{LZkoJ=+MDK*Tp9Mj>u#C^AnmbzIg|6O)p@^ zF+a+zEr!!|%}l8-OL`&AR~>*m2ngjEvj>^heROw--!Mj-g9#qK@bx30-jb|zz-v6Z zD*!Q|)f0%-y%~bL3_$Q|v%4`PRfKAjw!%R;^c0dN6UeACL_*s3lzzfxulnRyEo%YY ztm1&MNxvMyN4>wOYnA7IFiog)x9!mrxFd3JHz_(;@|e8KqXr&yp-ymSyjpg!R$u9? 
zAtp(_G{sSkZK+b7icTra@dd>`2T{fuqI;sEQa}I+%t0?$uNh*GC^ZXIQ!oDZhz}l& zd)yWY1dk%{VXBWwuV}KpoB~Z5mdg_q==zYfkJ99zuK|Ns=*yRvNk~Z0)y}qTv5Bhp zBOIn5f2Y4wPuLZ+BcD8pY{jadse)%s*w{8z)65x~?^820&@VCXPC+{~+`_F_2GzSKo_&}}2e&F>aP<{jySk;P-s%A0L=V%+R_D?GvE z#tt&p4D%{`+%@R8^NLM+n**I8O{vgr!=|es2n4StvCTtBsJ6fCSpXK8SJE7Tdukg9 z5!^JoZ8hQL22I=YY;uVBg|U1O0uuzE1T;+)013JAOvnuVuIH8v?08OkInY@w;pd>p``$(wFQKHU*Obj@@zJWFlf%C2<#f^5s zEPE=*IEG|-9XO%%3WH|_kU2GG5%ugZxAHDDy1MU%RXJ@Af2zE=uyPx+&-Kzfd{E2* zs!*j|1%@^CcFMTtX3kcN7^U`A`qx%IM-w|2o

|XJ{l@Hn$-e>o z1krh4!C2VG-0O-(-E-r0=T=rMAlD-; zIbm;m=DGa?aTw&L8A}cG)CoRecAvZs5=&c!nhQeSzVl+kVSHMOB%ft{&Y?qE3%ZG6 zx45-BqORk=CMKHD5Y2E|TAJjaG~nMfWvItAjHOFv%Rjvu$7?~De|O6z>H0p$QA&(( z37I@vZuVa6@*IfCPmUxf2lHNsxHaD=cE>`EQd17lxeAgEbX|(x-ge8cAk{-oU6}kl zY@z4_%L&zCqPU5S*>ZFaLV1%e2d|U6+$X2DZAvP_gl2U&T4=sF~K%+|LU_ zuCkHOkDs6wXadL2LKq)ti#5AJQ$K-rH6_c$sMRZW!6 zX>1%LtyQGFDn%S#{QCScEtf}Mcu;Yk|EFgBSGAJpa;zsOK8o(z4dJ4~*(g;0TQTCm zIS*}klhfB;lx@R3CzEzHH(TB5D`SlFJzpbE+b6P9MR3sOZ-Ae4Dt9=dwN<((OI;1H zm|p-u<{K_Vs-1JThO zctUyn!kcFom({p%)J9)DC+&8(H>P(nCB|K5T0gWVA!~Bg>!XBCErH~q+mY_99qD7b zELcxqPelcS+T_zZYDkv`lGZLSks6Mmyf^WPg@#$wz**gbj z3v4ZkUbvm(YiG71aT|)M+H+^_IkbtF;3Ox{)Fw4;r${hKu{D^$YBc|B zzLS{)nWgL~oj)9jIFGAn9THHr%XMAG>swKfNBA<bJCU(9-c?tz{7kbD)~^# zMn^7`9VT6w-?znzP8!4fvXJfNsJd{l$HbV&IV?dKfM=@<`<`{pDPB@OnJ7}%*$#-j zU8W=xM@viYh6c_^Z!s+_2<^R6mt#5Db_*3iHr>Ee^55o9{XHS_$l}CY=JJTo!*gXKEdIVPuH|c54As`T zJlE>2t)dB2lYgwcU$f|qy)0kkq9xv2ajZ@D%PCp(OH7YNX|cuVP+kleZ7bng}FvMtt~LELST2|YHnh5kXc#=L^;3WUZdTM zeYN}Wvg`IUT8|fSvkbIlNRf%JP?C`sMc?H-`;vv%{s9D%x2BEs~j8jOqL zohd?#zuM5Q_JEfx;;3`Kpu)064&SMEl63->TuNLy3N&a*5_*;FOAY9UxsH|QlnNEJ3jautbfY7CP^W6sEs~0W$G~1&OOvkzp ztyvrdhz|B%FX-WcK}fW^*aww%n6DT5Pu!xuq@Vu2P|A}OhQYY_$>yhvrMDE~xw^X` z;emnHZPB}5SDlHsQK#6o0AeV*P6#?++41RMT+(rbl-trW3pG29$~Znp|1sh?EQ`i- z2axtvwGh-#33yh=cYUPksC7!{fTKbVAdtsImpDy(p%cQz&BK&29-K**5@zu0sj-#A zgu%%eXX$*t=*B6**$sPtE=?rwtV#pRPSp`DAmZ$~H@E^*gS|CMOq_lDyrkPiQJxmY z=lEE-<-4vj#0qo zyo7A-5Ng~t`<&}1!Ofdtq4PhFBqeW~HWDrTj1<>@)pE(#Aeg}NPEY+SwEte)?c9El z6>`YL2xSWwufeFbz|t!8T7-QZ?~S-~ZRb3#o{m!1_H4q$VB%1_O_sLBmk9}Ma6LcU z7%I?v<_3Cyc4!C}r@iqoT;otFUc7i`*P2zgc0`qo`S|hUGG3uVZ<}=AFo!H68+F(E zBMw7r%8$$In1Pxe{x)7JJq!|IseeTkskp=K<^5*X&Oeoc?3$#5b;T)OWVCe5n*nLg zg1{EfJ*@-|*Z9JCp4=HBT`>VR2RCLWx%xd0+4OhCR4uEn^1du30||OD8*P!w#)h9- z*co6YeQG|`PPx=78&g(&c&TmM5X^GsqMvf8pQlUNsXo@@%3CHY8XS$o#w3k(Jfzp( z{^bxISkT@(NZQi)M`%Gm^d_SY7alxMeJEKe3RBEqngA8dr&YCHqVR1?_DH*i-5fb? 
z;l{oas_?l|)*Y&6oL9QI`n9eg*G-eE9?sCvumxsnAbs{S9Ox}|y972*%CSvg^$dBFv0F$DTiVpAm_@Y#&=Y!cFydI(MdY-q92Cy^IeiQdoN3ju`^RoP z$Y-7wybX#*PIcVs-nq7ih5_y#J3?h}57v-IiX80UqMV~!WBJjX4A5mzW!F2i@H(j^ z+*-4<)K=r2`WB5}jWI^&E7(StK2n*ub$-9iCjgr->%$h3eCpaDLF@7T4gd1zE2onS z4oaRreBJq8F{97BXT=^nE{qHeH>!3lgP-*?8M{I=2;jgC+wVw87VTsg9UpJO&V(rd z?~IL&O_D=wM8p)nY)}*iqTuk|jB9oyUm8nS>fX@m(2wDr^4SzRoIXX(uH3d5xy2^@ z_1$r{aPV0*-~2?Q_wPXkrr-!?OKYeq!QYdOk2hc9c^m07qUa;?*Izy({_B-WDHiUH;bBf?oTPOFcD_cu z%O?z`kjz|QGo1oIdzycoOIFS%g|*vY;a5ifV7~eCUuD76FTDYFenWQp!MU5fJF(-P zo<5;ztwjfwb`PA|v(to%j^H?%JM2Z@CaQRmE)pk|X2{$>QD*Nf$R*swn*ixAHt&wo zJE?~vvmkH&3a5e7R+7LLaQn-_QT>w4>mnWIzCoM^Qou8PlPo%QYp z2p>q=jmO$?-w2Ji-(Fe@1$-Z>&1YDdy4@B~#Of0C;1Brtzsld=p<0v60Ep6!iN}uK zdDaQ^wBO6^&HlRJ0W~yCesFkoH;9_+dZy%)g#>=*LdSNP^3tr ztm#DhJ$R*SZX#hSV=mt%Jv?g5vVU8v z>f;7<7$WY(&ki)T=Ic1cgkiXYoS}>DuXDjw+CXvUI&4|J^t3GgI==c~(V3rMGwsU0 zOUPa{_tgG@RPB`%#+TPmL|&+oo&WAq$iD;Q({wHKHT1FHdfUTg2gQ_Xo&VOVeFRq7 z)v%7xr%4?Bz)q4gj`VsTyW_l8J{BYpzknj0lI&Ple)DxuvYb1gh)xna?hMECm8Uik z4UC|HqMzp2Ak!cRnZ;rQ`qb}^vAE^K7a?cJ(R*??2pP~N4Tl#Dh*hb&&f6Q?W|1r+ z4N(XSHG064NB$@+J(ip3YY3vMT6t`z<)R9%O>XqKngXk=-HL^A3Uhh&%b+m`M|95a zh{+fU>F2eZy%f`nW%JyS((2A6;bk>=KP717=()Ew-iTVp4iqEWf>5f) z`0X~%fPhfqEg(>mPNO9=Lr>6X+POow>QHni3La2`5$UNmBhW(AbBJ|;Sg)1*F0WWD z%OZ9;yrVo7JEr5hzOLh9UI)V(IB_pauOU%k;oQCpc@qOrompLwQ=U`J)M_ zsO)5Au<{7_Iw!G^t3ks*I~-GNQBlTKZg6;`yPBquYy0VWj6c_LVy}Xi=)hvt($At#?}AX zpMt5HdyS^_R8?G{0rOx%lE0T>~AkH);+7LlU?oCLWzLRqkjo_p(_4jhj&sR}q-T0olC7-QTSGA2d#uE+pzJO{2fsW|t} zFf0c*a5P2+qOTlLr%>Vm+$40|WdVS1^3Al>_U zKj_E--3Gb&3QQY)XN9dHV*ddZMvMNPi%Wt6OL+3un*ttB1pJs^dcw{`~pgx4AebMKYg&x z1M!{Tw!aC!1S*(ErKCx%k|%t9Z2he4EfGi~dsOJRGsWh)bilIw3n|+zi6SFoiALNC ztKWV=^*`C#;7dS~27AS77olWkvU@OShD^p6{_6rvOj0(0R28tPhP^u~K*|pZ?s^64u#4-z9u?Kf>r=?OaM8@Uxk&3qSw;@LCm}L{tRrW#YfH zBEMHw;ot&eG_brt{FkF?{`ri4|HO&d<0)!8mn(z*Ne6Pq^V(-s13-Uz3xBfTq14@8 z8w?w9p|P<{;Gv2_(lQiTSD=E91epCZh=adDO^_LW(+U|Ydr5@^0p9_X^mF~5hyN;g zn(V(Ut#IPdG3DU{sSxli1-_aq79()6Ns`Iim{VFOnUqe#HjP+awQFi^%?XD|wn67x 
z09+5mLQ#EGJ`2@B8Rc95@{-j*X~Pv}78aRf$B!E{_>kB@2triL)XdCG%Yw%`pYdpA ziN1k>^UMGwMpi>dha}EBZ*4Q>;kEKRO5!|Qa}JXLQb7m1gU7qkHQ)R!_-{ZDhfV;+0{rhJ~XIs16~gl&H8 zJ{nnBHI?ir2+U}KvFCU)tj}@Q8#t(M!(jAe*D!7E*CE#G1Z=4QWZ*-bCe(x#n6%jI z+smtD?y(ytU=3TI5!$PG_o@CULanQLBcmK2@6jGx06Fdj8@Pg>F31{H!8x`J{hEZh zxOk5T^suay+I9*bZuUC>J~+hscw+7b5mcE`Fn70kxxh zpHWHZnvYMA6qk)tWHaqYx22L^D5+c`Z!&v{BNW*6d@6M8<+dEI3++nGE3Po`>dDdG zG`wD0x5MJeVQo%1`zzUgmq3;Nk27{KU@LXbY+hLS&3y$BT=7so;^ zY}2cvMrY2CK&8=Vl=!qj1Bw8zNqcoAW`l~dyqK$*nq{|+Ki^AX{e8<+=TIL12NznM z{JI}#f~;aD!`kqL-OgI#Ld<)TXEpsHKX?L*O}89{n765dOWN)CAaI30NWM*V&zy^@{`?<{rTN>?j?XEYE${c&xf2 zN28=U8`>BEm4OZb>+DO!BiX4UK{cy2T%nU-T2e-KOK=Q4`!*RqB$KfqUUaV*q6f)GhSU(a|n>3^>%@Et=r%QngUbgG9(tYz&ulI+%mK_ zQDBkz49e(go~9b#HxasfoBh|id!PsT1}g3~+T}ii%qqeQqO;Z5kW8T2VrO1_Of8j$ z;&aE>Lm{$UJ^8wIeysE|F;^jyDR6+c_U`y1VkAW-L+djlR?;SoPpq;53pRur4%(EU%#e7MiUtAfFxNIj!H^}JtPB) zN(uDXj7+LIH!L-99EC8b+jehvt*kDZndvl{=g2>J)>(2tdu-3m;FvBfXxIIQz5(OR z-4Wf_wPg=4L9A79_=T)xNSl)eO7E>%t9Y5Hl^*C<2;`BkuL4nv9y{)qO(A@6G`(X)$a1Cd| z)f$frx8It}d8Ns&Y>ut{-dXJ?*2dTgf-yJ@c+poo z)IFg9Z4o^-J9o;Hc$j+Ti&~5@F{u={dsLKamCap+D)vK^|*j3K|}*yakv?5jJd5#qw}4L*dH56RtEj zlK?-u@{UeOXk2-X&|q1()mXz$;iX3(}HV+l&q`7Ni9)YUh=L7Vcu#fwV;% z;Lw}l1fU!kEjCsHgJDQ?DHbND`X8}C8)|jDIcQ5wlR8->map(>19tm8GAo-3LHuxU zcW?k5(bVOMy|FG~MbD#qg1y4^q5-nwsJc3O$WgO({OumqZl$z4q)H>`=JZ6n1h)e+ z779{zGPz<$yu(5kJ9XbmA6~L1*`M8=;%Ol*MZ2~sQTXaBl)@0ZK|wAm(wN}y;n^3& zD}!s zVX{3==2GpX76<$g9B?sQcZ;Zuxk*^KOs_3)n?Cff-N=m=hGajEE?jR?0~_^X@LgH_mjNB@`dQy= zncG9l8rr{(#g(EU;X)G52fgQZsPtRF|6#i8*|{(YT85ZWBqs4$Tom;UlNLBK7^sDT z66}a+H9?SwWNX>6cpJ>;EH;UYi zOX)pHUxBTrNm~2WKk3yA7-b7;GlKxq`$lO$leMF92qTOgF)9nudSGNS%{1xM*hm+i40C z1=T!YzTsty$c`N{?AVgHw?MR($Jm~h!p9{Q%2HLUYpHsCCg&w7eQ89i>dS-kPm0u9 z-qajULE-H(wO?F`yLg`Y$s30lXXAb!vO-V2)j-KS{M^8UeZ--{8%M6PmB?-cvZcE8 zGT+kJR)5SH_L7BLN?cw2so9QGR7{gN2gybYQlx>3hOi*%w9{~y_QWa@Q5ZYHFzO5} z)Zt*UVTKNCq{GmePAMC6VS0BL=x^B&fwc$4Z|Dqk)%(|heU&y#BPYWy9j3<|;4JDw za^oHVET$KU5U^GMM!>cp)yrF5;-_;(nbCe`UBbL%Mx9B~y6-yD8>~vTK(Xk+^|oj3 
z25`9o8A?@yNlIro16l}lUok>FxFg}uyo5A|;qq)_oLf)yNWgU%#y*nch^EunKIqSV z%NWeS%L|@+n$zuZIQ#tQ2u`owbHCNb2(3DW@Yl-5Y8924)@#MQRH1`!-fU#`mpuO# z*QdZc!Xu9^@=r4uuJU*`r>+{_)UBhSx}JuzhZ5N#YVL##0VU;-|9N65f=!M`M6y&? zmEpKjLx+oG)v#Ptsbf5A1$}A3Ga{*e2XEPdfA8e}0JD3i9D5_E7ZSD^V~^wFsA-6(OWZ;xESdopYVu-Db=4cm2w7?x~Wy{ zo=yuE`$E%h+N%|pnlfxx}7O6@MMY9MMQfs>GYJ;XLt0dFhc@RRSrPNpM-tvPLe zdg+@>^6^9^CcVp*{lfJ~y587&07|a=G4Y`q|D!px#YwwKBC)(RNi0b6q53yF&7vd# znOkXWet!$>-{xom?*%o)bu}KY_V~!?%ei?Al=g392u8h3jnS7&6!Fspv0L}|jqj)U z8!HdPc0^@kN$ytlwBFR%-ddG0BI7SmHfSn|Ir)q!+uUL(Yk7l><8)x8h%P&A4&2rR){sSRpUZb0b4Xv-t16vuYX+ z=9$B9UVi(vkj22BF-tdJ?@AMXT(;DsazJ%GG7n#Ua$njg_}#X1@6EO4H{}g=+_-YbI|u*t=aW8_wmmFtiPAjzC2u@m4Kot6!RtDZbZp1 zFRU2SpaVsb0hS{$VK8Yh$!uqC&Ws9zTTam2l81nC=Z*fl-Km9lnjurW#xFuhxs4J^ z!^vNvwu&a-67Y%~&l{Bx@}tA)qqz1VtwVEmK|^Z@6+2!;J&>-%H1ychqH$~I@$gpE z?!zu1_`aWV#%nebA1F#_3DDyIGVYpB#lAp$v^j=%AzElHU z4L(b~&q8r^OC0H?1_gB;*oMe&&5z$I*UO*#?vlEJW}G%VmrlAfBuXYGX zX8w{j;#}a#IV13aSL_AZt1K{0cb69x7ABRh$Y~lD?TBXojJuUw5@e7-iZ(h&7yI&R zBzIHJ*%m^sZk^7tW5-&JcSMdA1nU8-J60?Wq)yIjNMcWtic^&5V8VpU(2+1~EQIa3 z?QWSARU%%(Y;=w~N0;?8%3)S2Z;H|dgwg%&!h7bmw5hZLFoGS5hj zRr1_;?LBrI?0s?H%H6xEc~zxMB}ymvyS}1{?n!&0K>>f_X&h0GbTlnfd1*k&o`0TU zdi086T!G%L=~sGlCeK?)$&%_ak^b(?u=2NqWLaIVIo=1L;faJm1eI?XJ>IC`Oqj5n zHV?!hJrojR()c75$ZAiUjf#YE3$#h=_k6=)&>ZV-t5m7Ej!wKb8MAqPxKMqwP_8Dl z8U_GHdvjMlB950l_N$-KPcy*}^4Ox|txtwvkAyQWBCISH(3>lZHPAQ;v7L<}oe^V_ z6Ci}!!Nd=~6r~68Q1FbEdK!#XJCC8ja0yZb|2CVMJ&)g;t?m8>7aNkV&h09?@vXQ*BCaQYnnZ`w~z>}8qJZz`+#_f z(g({pc?}#i3D?1;`~E~0q-9%^wU=mX(P!2-bHBJ6^PVnBOL6+XNpBV%eVm=5XbtqQ zvF-)Z*5|rzJ?9hL+qTUwgHGnX<|*B#j*e5~DvyRClw_^l9-@t%aL@CYwaVYNvT`Z~ zpfr~$-Ot%o(zEb3#<1-2gxYVGFWMm4KEgLf>ujc)7vD>avtl-zTR1S3!q4udcnOw) z{(B{F2Le>G%kr2Y9=f1x2VPVs+-}4oBl-KX9eb`D)`*W@4TPaP&stsJRIaYSv$Yp{ z(dxHbw;+o88Xj=px!+#Gt=7v0&jj-q(u(Y?0FHd>az??X%G3SKA| zRrAc=6b4O07hmbj;eyriVO*)H5ox()x2z>B=6^ZkY);>LOcajP76)sWo0-sH5%1zL zA!HYS9m*>fWLg;jW~mj!!Os2-3FNMRqh;T#az=`j&O8x6RA#wQvIBD-=5y>YD4 z$eDd$_Cgu3Jp&l1=0+) 
zW!S5W2%wxn?J*U|(Z0CA-BBd8$e^)yb@(Bup08DfkM;?gI}>`<@L99UmfQiMq&GL( zAnSq!L5J1WVHV43-&u9*ids?rvs6sxr~OnW06|&L0=4yf#Gg`x*kohO3@hKLzl1@r z7HjmIA!I|{UT7f*M%HnRVQdr1{RYr-wO-}dp>~j!4ug9@Tv(X+9#>VQ$Dp*khDg!G z;0_WKn;3+9`lIm8V=WFK|7@lb?ryuj%<}TE=Sb0H71vn7T)fypnUdo+DO2|JlCh_S zUe3k0U1doxRZDkR*+gSMDoOXzjjG?ucXENcN@phLUzq*>`IT_R5t^Gy)uS+WfgM2B zuq6aKOcJ3ko!x}fVGi{WGsM~{K>_-oY!GGK=159_!3;^drL*V1#XL&SNwRc-ROA&K zNHbvb4KD4PszpF~wMaNU+xjqX?p#$;ZO{)k~ne&!CE}W#qH| zD$1c6x7mvNb)eEdTdj^=6+n2wF6ryoQDB5E19=56q_eolZPRgZC~;;;6y3%2!-Xii z@T?92yLa3C>;+~(SNF5^9=F-RCAj)65^G60_?JGM{V2m>s4f+Er{i4QQyJgrPzAul zdpW4|(oP*}KGT^{MNOWq^axw*hXh}XhQZ0t{5Sy@44Q8(S;&BIYHN#ta2N5N+TH~3 zT`y6AC{wFdM~Uup{&c7g1`5yhM4$rZW%bhz9Rb$}LQv zx^rydQU5k!jMcoz`0M(Z1OD0|dagNXF67y%i=ouU4IOH8a659>YHy1i8tyEKGw(RF zFBbZf+Y_M*ceRSKyozb+t`m2FKrnqxdwy?Lr9qG)lv3$574pyp%0T@ipIL zyi&SaSN*F+#Hz3)$AkJrt78chi`kLM7{jzFNg{U&q)$`gZtaNhe$hKJ-e)Gin?z+# zdXk1AfRb&^0r$Y%Dpc0M^U+whIt zZjjjAhA}!iyD*4@bm%)z&qva;8w=xz)d>)nb&y%dh|E>u@{UU@Oj)L@#{>PF=jX7t z!PxS?ujoR#PC|CN0GrgIW=&yVR>F+F69O3`divie*)fOj>k{8f)w;@(L|(7CYCCS# zS=PZ2Ea4aF`O$+5=pQfC#-~;nn?s011#e^smLsU>fU#ZLO4`=ZjFC_+OB?(0uEMm2 zV8m1vc}|8yv&y>kcD3P@>L9y)0S&_lPUX}o*`aOBWA)|5!%!ywci6sCIH)t6P3$G7 z#6rx;>`E_`TysZ96oORt<+JU6bIF;!2=!OfyR-53^nGg-D)~s)L7fUsLri8b=l$#~ ziZ~b9^vSLRsSqaB49V=07m`qqyIQ4U$fSw$IO{WtjYl~zNf8%=c!>^4u)(;(n5b(5 zjPVay;xuFvsK~6|dYh!4>{)tP-B0Ne<0SBY7;qEkV*_TA)ZxVr<`~NXdOaH6ok@jz z7Yv9}wnbhV5b(d9J`wg8X!3vly|oa3SrmTtMWhl`8ys!@&_nBwPsbH{zHblBa^2D% zcE0d-KaZ}&S)k~Q!|i?*?xpZMm#A!iujTVTx9h*Uxcv$n_(3ZdWt+4Zv9_A2POp%>;4AIdchvvyF8SZ}{`YR*|6T9@tGND7l>+tu|DPf_<&$!NBqLoX z?rT`8?2ZenwTc`6V$f1nCUc2n0LxOpqF%AS{_lFikp`N}hC=h@llrrrZ%(W6*<+QR zVKUp5HMNKf5XDdkRDKpa%}0gCLyCloyvKnj4kKD8^+bmv8sHRIlXlxTa4A-6F@8I9 zKiZc2EZ%@RO6BX+{6bm-Q3o4fwT3VD>;ip)TcNZ2SKthzfGSscCiRv2-`1k}Cm0WN zB2k%tvSx_nU9{K(A8?LjI%?{$Gy;D%v)~pt<)TJ6q&)FZ_aSG zt>n3G#k7sNh~LKa`h>;oW&7>tdL$DGy|#@P_X>%EK4qTCiEv-lAEDNyRXfSFg#@17i= z0)&AOBe45@5)Gl}mcw~Q2jW=tJ_cqgiaTgAH}W%OE^684H{V0XsJ#TRKy7u8V#itG 
z!2^cl=a^qWB!WODz4Ypgs8|q3CglY~!%gvNXx~6X)nDueksb!<6bb9KGn)2la{Uek zLDnUrDeXJEn}K9oN8?HigBMbQ>%T;R|gWNC$6N$YyqkfP9Yd%dMY_yml`D<3+2m z2fU>Uw{TN15)W{CGT7kRjw_Ome~95p{p#I2kX4UZ;@|V=eLSIFs4t=1jLh~bvlwmV znB3daPKW|C)MpSr{9K}~&WCsoq+DDo&Bm(L#{k63=tZV`u2dO<5u~a+(%qD%zF?M%!%LWS#0;FHvc+=WlRUST= zneXFqKSR6f^Fw=G3QLu`^i|*Ij8|$?8$iF{Q2_>ke6t1cFs=6T5>l?Dlmx;IyX|N-G_+=tuMiK&ntgd z=yDF*c7!VEPE>~;1*;?pXYuz(+*dZ21KZiQ_&d*F%(@MXg)#srA5nx}!w;R^xUKMV zQhJEwQdadOeEk zl}4O&OC}3`f&cvpbHaJu=3w>gtgknyltM`qztr|yDCkm?&0Qz(p4$mFIf};i7i3Fl zn>yE7d9c026p79*a_;r zC3GLmbvG8&O0iEeQ0hBHzYV$k1Ruw3cT7=s14kDOL!G_8?cBFX{lXqGP>t`d@p!65 z6y`P9#B?!L8@-LJO|u4=Jhor-T?no;TdKHLa#*d-W2e5u4~;cRr2d0_+B*vg0#m8s zP%+cu%3cf@R|)k0a!KBRgwr&%)+*x(X7u_zHzxfE=zXXbe)T0KV^6rh>Y)540yz3u z%1BB!9W$NJ{9k@O+saA2c~rK|Y!y;ieh9f%{J2b?o@;xjjJ9*weEtCH2cJ*}-iBGvD^KsU=ktywoQ>dW z*VnN%b_W~YkBlFUh?I-A1UvJno(8q#_LF9#EW>GRD|s1Rx3%e5im;>%&cp^J-q){o zeoKlEb#;q9c3uHB%+GcU(>X?5xQoACD70ELGVy^M&bF7+b-W^~HLK1Kr2%ui zVZ?prTqiTj?!?J9ZoQ)Nt0T@7_Fu)6iijAZtJ1T zyFAEoip2#%H#9iIwt2pC$@+VMfHPlnw5+~BZ{ARbI{NfhDt^9?PgMgs}Q0DK@r`n8LQUUN*YudlO!E}l66#IqevNeHI7e}A>;qnw90T9Ey1=`CU2Z#&Trcjkxh zMha}?$@^>k$ld%+kt;i+VfGMj3Nu}9;xgS1vhj7?s<9+2a*h34V?&75Q}|j}!}{kB zY|X)}$Tr-m$sZoq0u!}9e0C3ebK$0E(ay^*muKOiz~2tzWZJh&6Fa@nOQI=YwQ>tF zjk1&JHhxz_zW8&_Pc zkmRx)Z)(}$Qs!gVy|7*&X=K+m>G-e`I}goGOV5@*dbTAnI5Zr6V8s_M8WRpejHJ$h zT5nj%y2bS`929?bhanct7zksaobzl0j~%w?KCGwkuyW%)FnWJ zTcM^kkpNuQa-qXj6W3~=V|VBRD6F}88Wpp1wQ1o*rpbjp25?lO((DYmNz(U##eqH< z=^ij?#{jJFt70^$a|z<+9D9HA2jDsA2lhIzFJKV>nqxjy62y^z&IaggCxaDVXWQ|^ z=zD;MB4v#Yt&wwx`_ZI+q zqO1&wN0Z|aw!kA>g+pVZ>9CR6e&mtM-Av5`ShUBLvUl_E;OV<9W27^3jVr#2ulOM6 z#Rf^Fj69P4z~*uoJBW-sCnFk2ECpn4iKGJOdN-JzT1c@|I^mGcquu-rUf73qknjxM z%6XVqnf{?aMZ^6^GC=ojL3JZi@UjCwjEvB}0;FpK$)ITbZuAaa^p9k{># zm6;5LBsU3%7w20woi@jtoTp9J#Z3(rFnZ1${9Fo`yKm#KGaz>yNW3^yu(ftzq#^gAblsV7TMkfIL_6v+7m^6NdN44oua0=X zjTszqhnZ+vc;&{Win8V^TIiI_|FUMkd|xAM7r9=xzHG+c`wBV8xC z0zfA^aCcBhbvo7-%6%*a1F~^?HRJ~>fuN18O9kALiVmv#h;tooqL}+1R$R~LHclc` 
z^E)ah2lt%kCIHSB`@7D=@aNRO#GyKT%`DoLf9}%ATUXjjx2M}yc;vsM#aKLPG6PS1 zFw>hopSx?31ff3d47FoHvwoz#`+uOpsMFY{~11KBbzSA?PlGPX& zJwLhuddYoGh2;*h>TvJ}Y{kk0xW~C2DeKLpeDzGOwSLo7+gXi8d^Vt4?>xMS2h0T= zuswXr)W7Osl|}}9lQ1|aV?F~UnW(6pUye|FKaL|?E0t2D%$2e`<)YKsySryJnlO}` zMyQ-|6fGw$GDDY^@sD{Kqd+VUzz#Z}^iN!u7!OkiB zHrP40?(}X8FP*=*Kb4!Wdzjr*ZjZuM9$>S;=dU$fV?G5nv0ljbtWp4|4<l!W|F3QtkIW z!rO?d3s-J|Zo40l6~PSbG0yy?_8k!JuS7|*AYU6Ek#Qc?kI1;aQ7ap&qX)eCyewhF z4Do%Z_tmI5?pRStG}Fe%Kjv4*YoXkc#ifsvVLh5=Xq={ zKW0I8<18#sPW$37edR;1&JXr|MqEV;qVczn_N{zhoWYD_LI>d}3S31S_8+4^?t{$1 z4=$Cadq&z1M@O`ZQFH$?cwLRDeeL$iklUGAz4?`y`QF?$Dvmvnvy)|IuVzr>cgFaLD!GGS9EoGUX?qvU2-TMI@fe?W*wGm73J{<9$~jojWdJtGsJaGF_Qp zO>qugM1MSfhrW23F~3{6Sw$B;$7VnloQDQa*gsUrwY?3h++&xf618`bg}3(Jre60_ z9o;ng429K&IwBe=!VMA_Qmc5!l&Y9-P-HK#r3z9o-}=@ebcVm0(W zE!+1M*^6J=hSuY4*E|f;$VPHEP1+Azz}@Xs#H_XzPa)N}N}p}OFIITld1>|eC*(z+!ITd2RW=EN zT2XfM5yk|#W+JVACZVnDzzxTRolEwzd_Yz|@Mye%+RrECRJb-KRemkt zo@%wsFW-!PPW_k4TpI?Upj@=o*G0=ztLL(welR8RX0vHedbn+`o!q$aH1F_R|19U( zoRfihe7MgP() z5^wO0-U-q5>3NotfpjquOf`2|LXCyp@JtIc3CAGaTN?4w)|LzbaTdcMke?QmM%3z-dW{}CTZ3>E>tMpB}u{=ROS|}sb zfnuVn0Pwfrshqh~?t%g{GY^Y>+#K*u>Mw(8+L2!`!8*9?=_97Me8nGsD!tG4Pxf}I+kvS8^QUw8uK+yRr~rxLt4!~R^vyL1oSS8QPa6Q zu4*DPWzyt1ijy~4;s$~d&2j=9MbQ53Ya_8>HxXOMmFxnL|Wxo{|6ggMLy4X^~MF*xez$85rmJG!=BDvUDT z{0x99#kpL}s~8sg;AME7_Ae{*5dAXanrOR)y9yw5RYCZV5w9N3a92?TLg9gJ=5wrt zIC=7q58?Os**^JmSyW!*S-0oHyeXk(mg6f?a65&x`8}XER;9pXy348PJT^mDZQ)5C zn=|LI?>rwqK10wMes5#5uygdC>{wM}9FJ(Xr9!$&$MQMV`BdBHv?@p|hw5FJ#{RX_ zcaNj<)`hkQo_c0sbtlDU}R#1b&R-N#13TK}pr&KSYqb)*}NB2mcJ9I-Si-%w{IWSQPsO^ z1^xNW8fFX>`dzn~5nY1DHs+xzr)13j=eQ~pg|~7{H{MlVJ(pR&5NC4YmemsHh4Z6B zR0h^7i+B^>K2Ck^-9=7ir79Pan>@b7MlXL?rkTfi%n`mbo@Jl|NBstfcgIVt#JT~_ zkrC7w@xfqrf^muan=xR=19N$Ed|XkhlU5y3N_;22>~*HlG^o4BZIUb^X)rX-*xM6< zd;e}fs-Am2P88|fqaYDOP0P4 zvYrAUmp z0MZWmNtW@I`#Dh~Z-_i7N=M@p*KKEd>uf&z&0hZJbNBm){WNqhV$+*mD(ZkYZvqr| z02P*aPCAmYRgYQn5uF;Vky8t3d{ON97Wr5sVG5>Pq^CF zrky_LYGxj@G&gKZ2pQdu8CqCzZRJ`pgPaNDhp4GQNu3s*?<}E43ktu(Izk<#Xq>J64`!$eVf) 
zbc*JncGjArbpGnVddpr_dt-MCc)y+Zu;xTvTb^q#3}*v~6=NYSO4~2!0l3=w$#PM} zKe8oblN!@f=2Z}oR)>9`hF)n`}0j61o=uWHw0_VwKQG!Rr(;PyNe|$iR-@W zvK-uh{XSJday;wM(^0v#RH(6F!=-_J|G5Uq51WKxGtz)Iu&u0efcccF8Kv~WDNjTi z@y)fsEFWFi0!5f*8dTZx_kq~Mt^LU1cyQ7VO1V8)SZA_|-oi^tPNP;V2x_hSa2?vr=k1B`HTc|8fMLoem^Q9M=4@~>i z%g(O?xs(1odH-Z+gnE=N2o@l-L|gaFkjw()JlP*g9dnXGdv2_`K-X~G%EH(y(6z1Bd|R%lqi ztQK$1RNdjqdnfAa?v5uR5Fqmf12uT&2U!{Jn&E3Vw;)mnE)-tKWPvy}8ZwAvlG;fa zq9k^g>qlpkF~o|Z!DQ0KgMap_OK=EAyLI%cO^>j}ZncoI^)*FT5d=)1{Id7;Q$p(h zdhE-5`TK7cz*9n?f0taCdLOVbTJuL#tIJghQP-KEqN7Lqn$k-$}$(x2e2BX`LQ6kFv&bv1^XQDoTlHR$zbusLY5 zu*7Dm=NFZ~4IZ|LK)?Xw(RUziESnNyKwU7OVKdOQG7zjOVjQ}(6?v@=MGXL1nX@M_ z+C}~jupG!!blI_dAA2rL4|jTZijV4jT{lzu&T1#z(QIG!q3Vxuf#%ud(b!q!^dZz` zzIWsAtq+9OF-R$%Kyrm3BxnjhxzrgY5{%F5?K}$Oye=K*+^^L%)f`tmFbiGKfn$Kx zl1}=SpaOwsUxrz>aiix96e?lL<>jb2e_hjrK?hy<>PYF_9~}q4tBYlKD7V*uIQpfD9h-0K8tpdT^v1_qG1!9UO6_ zkL)T9^qLMo<0b2Jy;sNIfQ4WsRV=CXWTUX-*9l=~ z^G6Q#0Mm`NgXRK>uW0dTk|B8Y3@8tOnBc@9>#u3x9KpCRuVsRH$Q~gX|8J?JxsUwzkc4YU6${K29%8ul;4L z{OD<4G-EcDf{(_2H2L+@l#^N1Iqwa;bBMlJ;dJ@+mM+`*cq>bPl`9HLJ$bnMoc>pq ziF_1sxswN95jp;c!n!o)E;E5YvOB8HMPgyk(E}AR2sE^Zi0KU#u0d?^BHg*rQ!*43 zxx8zUXAn0da2sNXr#A(R7$FYk$J~h9LT^t0h!-~S^KuZ?+qkC4BVD>FBac33;-Fij z?m6j2ZLJ&P+wI|j_>nOWvf%rOEDHKpxz*m^y0Q%;dt!Z`=HmCV_im`t{=Dum+~qJK zJR+{3)SYz2{&Rfy$UK{=K$}+G&J^j8FdAjeZ}iqav3uCb@^)>|E1b>WcH}9ZN=OZTpYK$4H5) zOuomN46mN zxJ#N{!kADGze`%F>SGsilG2}Dzty7Ua9q9ouG_ev^vMfI?Etd3Llm9_g~)Q_H1>go zKlWeS892kGaeZcBc$ixl0TSy{1|Vvf_?m*p{w@k<*#sZ*EGWDoDt%cJ@j6m?-R;S6 z(+K2rl@!6Tb=@{u-uYB@X1|lrgdH(1CHoz)HS{=+NP{p#M!V7vlv68 zS;7~6hIZ76p&wLXrrxpIErY7>hTq59cQ%hsLhP=;F?MknjA!4{1Ai4X6B=f+imNBm zom|c;?DTEr;zkU;v$;z-{Y#37XYIKTx=^!Hdp38O_Zm+)Ps;YM28}koAr8VQ=s5`~ z=?5=R(ZIGd?F$E-f3Lj2`s+I&OG4jSN;B@uL?rr$3cbF{M&RJ$T#UXT&O&GV`#z(j@T7LjR+d=8zN0X ziNv^?(K!W2(}f$P!kCy{jHEgnNd_-*ZG`t_F}X+B%Z=CHb+yraqZ4d6I)DFKW<4FW zC2ah?I^EV>DJVMAe^w=2`O5=CKkGLDiq$b(g?E0+5#8#}kg+jhWViDUZOp_HCtTdy zSbw(Y-KC0m4^%l&mF`4<5S#SDh~RLK>-KgJ@gbJ3VONZA@b#pIZ>+k_cHg9#_-@~) 
z^u%O+$=eYhT-2>riWS=hOzjl`GJ@#o1%Q>I!1))^T5;aTBqNdCg7YmXjv@RJqM>eR zc^T6)|3t}Vot8R4?wwmsQ5X_k`izH^+gu=?&Y%DO+C4*NOK=Sn2H^6T zUGdJhP^IJ^`L&OvqRop!7F+=AwNPHU@5&7m(_#+6rXK4*nCW@WAYKx){BFlG^YcVy z#V_jE{-#ys{x92J{5?zfcx-PvYrWcJ$TDW<2vtmL@B+BoE*Pw4)2wRc?*HVfyfd_% z<;Kr1n1=1*m=YMZF9j?p7xEP-E*Dm`(=4&*p|lH*#6 zhdDh!*2|ty+4EgZlVr3?=5}2jtncf-cCJL|-+%u{Pd}iAb z9ZzqiO9HhNeMh`|>mVx|N~g?8waW1_{?Z#9nS0^8ttMRZZNuLfgT3&|{|Y%h2LDSd zCmDnfX7?h1gR}JO4aIL-J zvAAKi!`TOICLBWwnlisai&4E`thFge-n}72cwo4IZL{ShvWB)qW042AcQ6X19B(qNk0jBHgm9PirlA zEuYauhqZFT%!MlIBdf9}`mPSlaR}52!I{k^CLhKqnxP#j@s_(mGptq3Src0HqocDs z$$Uoxdzj@COtBrqHUYw2s>yp;@oRZyNlK^-d@d zS<3{NcB7l&6Y{43c1|EKJUD!2*s+|mH{h1D{@t!={pVb?-`|3wCM&gS^Y~a6@A!b# zt#)^Az&a&_n{BEiydSWB+Tprx#%YGV>={l)?xcN@KJ}d#+HOt7h&yWJ@p}qxLR9E& z%ChZF0Z~(NcszO@Q=@g^Mp_Ivzlcmh0_Po3fy<4nyty;*`nN2DMC*ZWM(NS{)kjWA z4yHO2FNl=K<5+RXmIh@EZ)j@hbQ#4?=W$M7Z|>F^n9g5XKvnpyeu%aEzcaToO`gsL zXDIj8v2JvVuvItawdEY0Z6V{yFKm~_Oq6z4)^T6yR?H=d6Zm^Dg_%)o)@a)w*~=5h zO|mL|L4y!=a3;^hG*@}Z#B`wdCtv2FL4Pk(zO~7)KthGJVYrW0lh+eAL4~N29n|UR znp_hu?RG_a?Dng_{gex&Ie8VV<<9h^4N>hQybvwMP-nqH$u|DouC_32JE`IRbfO8l z+^Xg|>G^N4XSY>d*mbKib5-rM$>qqH-F4A~(q(C-ox*q+V^~PF?H!aCccq`&nv?3e zAHJjw0Q5+%t^<%iwbG<}hL~Wl)m+jbbeqqc-c;3O>Z@%<6^aO3Oq{4zs)WlH)rMZb zQ(*m%pHvhYUK2pK@|B_L#iJS6xNadI)XB2MjtUcPelq_3c3w7zBbm?6R(-8b{*BUS zi_ab@ylopv8|$xnkBC(5a}1Zp4uSliy5x<^pVy7!7m$b;0iR||419+O@tr`#*bF`7 zs69L$%Xerc7}D^kNu#RGy#;04R9j!MN9=n4tIg43YE)0_*vBV z)J%X?4!%r&e#t0W6Ze zHTD7a<_~AjzE~OkRFH=b6nt2&8$?_7_T~PPVZP0?cA>ZVoYGS05^4uh6|3d5_x&;r zKPD&L(zEix_BkaxPx8fg$!q8qU*L0JdBt-Z8+TEqn4V!ufLqR$(Tr7&8?Oh1JYNit z0?LFw<4BR`P&HIJLlX6ZCrYdOK!7(i>=wTJ+C0@E=yQ0u`mwOD;2EthQ$wPXLvlz} zm1KFp+aK8%L~Pq(^+MeShf|nCL>taT`I_y`kcOD1W@neZT2W*xG7U5@+@*t)6JqN~!TgR?);pQepGXRs9atF=O= z$vtL^qr`tZ4SR_}x>#OHqVo(X^f~DzL5TU3iw#ad_CE1Yk`Y#E`S%5#IdLWK!RMTJgv#q&KEVoa|T;>r41zpTeFNFW>T;jQL zark`LO=VJ^@M5Y}H%;S*e7U0d0$G86MMT6|_=X2Dc3s}E={IVDosX4Y1TX_)(o$|D 
zI=AaRz+oAPyq*N~%Vox68$=#K5I3g*tXmH6sD>)hG2S#o+7gi{y)&PID-|YZ$jdcIY-YpvkBh7@lb{U`r>uV))}375476`wK@rUG)*G%f=-7|I|315*C0_ zqG^?NL@m?dZA?3BlHIc5J;TvFTnQRX$RN%} zJ)u*x9+PAe{QJLXotM%?Y@zqzvNtraEtqK3Z(JroD~XgFuo=+u>Q?`uPYf&vrkuJE#D%P$CY6Try;uKgZ}@ z-8{m-v7!jI3i316zH(%w^K=y2_rf|;=M+cDo!e}omSklc07b~q^!6M^4QJ~bAaZ1sdVDiSd7&Jc6FW8{vNGUpkVDxCWG50*n5l@z%;g% zkP9dG5-qz4E=BG(1hj75O3^}J9(Wx8^>SAlU3o-n^JD&EKhG1Zc2tvU(u(l^*TLn_ zy~f`}X>8uU^ma)$9$1bnqSeO4)$ljnO-Cv!C>Uh&bDRn8`A0t01e#E1>xSl6&(Pvv z4fb|Lm1wg0vl7xUu{HWZ-Yf+TdSiv&!-$Cy^Z9r`M#_LK1S6s247hpMzDl0xpW2wh6<`X zi@D0&&0U^&x~^YlmJO}a$Y-ZU`-|5S*I zU$wFSeoL~K_(T1pX*^9@pReU@euu)2~+5K-X;JS;-LKI_JA0ZOx_%hT;YG`MHP(xw{i$xmeXdwcyP|Y;~wr| z4+F@BOT=f1v|MNB-)1ySdGvfHRs_h92(8WsyBC#&?JdCMoeo|{Etf-Y2`0(>2yr0* z=r`n`nThEP$SnlNM?Jy(`R|7sI}%S!jZjKLX;r?Tc|yNkMCDl=-P(nkqZ+kI;=FI~ zE0zXUGD{8obVXE_1wJ7kDW0QcjVVW!l-5vEoCc8%oca?SPyH5l!4XAz&hh)cvWd0& zbm)pN@-r74+_gxkp>DamHfYa?geR(2c)ZG-6j?hmcwY>xDmDN_j%^skIaww-NW zvp%yuvOvO9v2_fO7sTDCd)cr|+H_aOMY5gm35tp1vkNH_I*=MIRf;iO1lXH_xIO@q zBREby-%rN&O0^%AQet(fpBvErqsqNVZ)6^Xcaq;oRRS3*^XfBT&(btS^>u&`T9?l& z{X(Z;E&Znt&}FZgy@U#vpTvH3=1QNTjazGuaC&9m`pD%IJpk+&&x8&XssAJ-GxmwO z$<-wpas$l8-LeRQjQvZ@V@^B57GP@@!SfhN;vzaB?c7pPj=SW}8fiwp>rG1l`)+n` zv9LpQ5GS1);M~;Ebhg#{xJSzna8;Kjy(1)^FmoM_x%p6CmC)i`6x?2`f!jpsM_tE5 z_5xwo8?7MD@ySg0p{*tCxg8EGO*Qx2Zw9qxi_c@sk_CoNj3G5s%@^u0!W!w>W_Yvx zQMa=YTASU0UK~|2lW#8#w*@tdB1*a$d3p5;1Z>=`9{+^x_p3{(RC9-e^2iagx8No; zZY+XQr@?1)zd4YCwpn!;-nmhUNX3k1OVUN~chOwoEjTqLV?#r+zA3A;vCOq+><_I>^5DSuCz+7ch^=tvSfG1y1ob>$&d2%Z!M+GbXhOBn(4EpF25uz33!z$eQ-T_0q_-M`gBE zI3;>t1N-vlQgsnE0kSvLlSdQ26WduErEQev6uU1t^0kqhp4mL(Z*0mI)lSM{rI{H` zJWEU&iC!k9Vn>vVYDd(c=5Qu(oPb_;6Z89S^yBF>Z3{n`3v361245=O$tJtlQd~%m zE439KL6|Wz@}N1A^03*o`#*7C=Z!UI&nWCo_=LE#YMt#o4eV2$(0NjrJto?zn z?7jn`g_AD+&~)|${17ak21DruSIpOdEy0SZN+UWgw6fou~w+b#fC z$K@J;qBG7aVJv=(S^!uTCC5)KW&vY@J0LOL7ft2|Fff}Nk)Y?_3CcwP0Lx?Y&Z8o= zls>?@`4R9fi6`>rQGM{dA80J@SHHz?#XuuHe336K$)>-6H z*x5YKnZodFS4>N`jT@{RY^S}+vVIfo!<17Ku}Pw+0{xC0Yq>CE@%wAI;I 
zSN6YOl+)LMTRi#|_?B|CeYNPo@)x%#(A!}?nxoi9>`b-Q%*gx?px-(HVcl>hFDK=} zAsfAw;+(O3%CNBl1Wqym;3@M?5qj}sv?J9kF1zGaSr66+Aj?RzXCSNm6ljgL?PzKU z=-8&cBBt$-|LLJI$18Lv{GmA8y?s7JNkSX;|9@miZ4FAAy>!z{`1X*9?( zNj+h*wBc{s%%EJ#LuC_R0x#byn_-*yl-&s%@hFw9Y`mXC&wdOSVae7tu2(#n#PPH) zQL@?KJL7&i<8a(XM09&#&Mt@?UpO<$1U10wYS3b-{qR}jd!2Om z%(HVNzA%m`5F~UddovgDhLz6^0);KhG6QhV(TC)-zuw-bANt+(lMe-of+U2(POu6a#+ELIQRCLAxECwyb;v+AhnrnsM^nuLG7}i|1tZ)dnKAV-hF9Nt=U4DNp8Hy(4lU(s z1^Q)FyQ_uXWLHB8S~`G;kD{m@xB3BT*m(u1Tg~69VJn@qVH&YYOVRzug!7cpo=C=# zR80*snZx!BB*wWHD7-RNL?IlD-}-qOF`n1V^*-WSe#eh6Lbuo0 z^L=0z!|Jrv);7@}*47DSk0`X0$?|&pV6h7Gux0xQ6P}QJ2@+`bcKpmq`o%b2y8L@B zX^e&;d`F7{<3w=&U74eg5XgZ52LwNfogC&{u|_a@Ej9x@OFJHz)$P1hc|TAQv1zdC z4U7g#g7dK_7f{*A9uJ4ewub0`XG-fXn>BL7nMYqP_BEMQaA|{D7`ih)^6oi*Z|qYk zigB_F?9R6&eD3F-z?{$$RxI^$6gB`9S_GbX_^sXD_JZyW(^l#~mzIk+-^2T%cXvO{ zQQ-dea6nXAyLX~9A$mh5PTm|rZ6+5$ck8_lRQzkD;_4awj-QnS#*JB9VZd}~XbQo; zYgKyVnYMYb0%Q~oS`j)TfaKBjIdntrA!V||QKeDEP2!u!nvB3;q0_VwGNp=K?P2VE z(7U%QubauAw>t01S$e5x7PAI`uKW{UvkEb&eT`Iwj@Xj%D3QH#{zkHe+xb>?v3h@w z3I*mjJ@Ap#&9zhS-B6SUpZ-vV6?stbpGl4E_BQ8-G&b$0pFVkXHAXMw*PPdReh5|n zG4}P#!dB^w$*jxdj{C&^5T(Y8VqQ&wE<$#nwGVT@tF6mp43Xnim75^(PW9$L{TUnE zCC4MWY061YUZFo8keSUBh;%6MNF8bZYu~W7IG(ehN7PtoS8M0=5xpa0zIh?oa^lhj z7E2O6S(R(zYRmMJ_3LJP6S_c;AHV(i!U+1z=SB84c1*b z19+}P^H+8&yu7daT9}VhzrS1FvU`Ui(TqcBblgHPw-HC?UgQ?>u-R|Q~R zvtRHGV;D!S0sHJee%+OAp&voWmyd=&ZZC@>G76j}7PHcSP(SBHmE7q0;pnv;_!-Lw zjHV=*$47w?!_C45p@P}H4+&WO$dMlO`HV~m%IRg-d#M-Fa=a~i@q%6FlTx+gha21o z80wa!rC_;skN|Z~s0qGG(hP$=4*V526;k+=!Jxw{>aotFVt|V8Mv1PMk|^rk=<>Vv zit@Yt*SWblBzAdzi0v(KD=kXPj!HfWR<;7jVa?l%_;eonE5LTeG5Q6TLPzqhHs1t( zA%FSn(!kAsR)yXuR|&qJZ8EnTYO3>^5Po^G+7utswe-^OC(sLzj`H=WQ}8;snyc+_ zk0}BU!98vJ@Hd7%>n`P^C61AH9+%=v=01ozK-}IvZtW^t_}$Ys@iVxl#q-jF({iEm zBAb@<>B;clx(`ClpLe~OA==c%dF;@V8t1d#-xP@1;7UiUsO&T|>V56INtWSxT@d2A zH%K*TQ-r9M_SwGUz4^I{mHEB)4P+_)+WLk37)PQhbpp26%eEm1ZuS!RcILY$8jJD1-k@$MbK7zfw`={voX`U4EbPKc+q-8qG~r3K*`0!+sn<9b!&qHLdh_ERs`z!a`byrtxI 
zpkrTkrw|k?6fFOZ)Ykjd`$VUA7>HPqT%kH9nqCAR-4pq|NEu=d{m5xHD^UH_7#ACAy~0Y`>j-sQ_sH>hP< zwqV}U;*xv0_r;^qrkqD@tJ1Guyc|huBV9kY(tD&aWKjQPLMfs`iAj+O+@u~aBBF6Y zoPc)w{zTPf4PhX^R>G6jNs~5pcTg`vP0iGghiLWvDc3E#Q+F}>veHy<$6K+t-KF2y z*ksZb8-*6@MSyG`f1ySH+x`rEZTln;q_E`LKAE_jxw>A?d(FJQ zze?I{%wQ~c5Ofzo`L9W0^1P?z$GTVND@a2kEd0E_*E2oIk)Ek^yV+CttR}Fr<&(Ys zc}A$pJDXfj26>kHeFKop$$(#lAhB8;Bt5)5An+>9$m6B2m}J|eVZ#-8`{K^%Fi7`_ zu}3%00u3&n-`0^XiT~>5gwRp5@xyY&)*0!+iO}#8bVzc@|ESrv%a!ZwGHH(f6HH3W zBt(yUDAg0!3@ngn8Z}|2D~y7|-VOdZPDNR^h&P_N(RjH~A~~<$W;RULYo~s8$u}>N znukZy&?~30+_F`<;(FaiJAG>s* z5*p$`GjXbBy@Z#(L)MN5{t5zU`m9(BkGJDuZ*Ll2inYO0%D>-|%EP_Y(^HD=ixe>i6M zN#TWY_Rx(pUAdJmqh1C(IcZtMZ`X61``7zuY;lWe6qV6fy|YvC9n zVTY~?_g21vKd(YAB;~EI5Sb_PnGJ5p5>}3d<1gdJvV6Fso15PYY^1Cqw7F=~v#-`D8-gAdyx{B6-56RzXzRECV>qXgOffOWRn)su;z~3`$S24T|&5X zH;MFYSfJwJQ9`30fbB90dT7?xw`~NZHh!S`v|yQMq@E`z_RgH?Gp1C-CTWk|#RiXn zhxQfVxKN7!{RX9Kt|#sNj+}5|Bz@@cSnQ!o=m33qeiuw%#@BjTOpsK+K zJ#}LjJRTwP#;taM3(vuag$=ytLjTo>%=91eIgweo9r z_VOFOet9&Gk^lU?{_zw4oL~Mo&+&PDlRwp&NdDt$|D!Pf-zvoa!z&?CPf+DJkc=Jw z|Nm#-9pKMIEW%v-=Kt{C{~w-BJUlNt^tk?gKmWVP`v1O(|NZ)#kan8jSG13YT7X==LUsa_45EP> zAL_s8hFW54`!^79vLH|6l2VO$G72>9pvibhH3;=glnV+W7n}{Sj*d>ASifCcMQD*_$7=fdUwG?z zuFZy1VVZ_2uPtj-60=KRc1Bt0Pmu`cR!SSL&3y=Nj(O9^{vy>uur0zCLBzjTko`B=sw@gXo(e$cf%?)1Fc*GKyH!NObeX>H zqg(Fv`|lNh6dwt-U$(SefY4b?VcHI0Vto$HdklI5gzDH^_PRij(dHi_B{(sIK2QVx zx|L)u*eJF*1>{!&yGo_Mmu(L%*=0;-)U!Yf<=&JPk>~Yh&avS>TOKs+2#Li2Zc@wSL6iqmx%{OaD;b#8GDY=8H z3AIr}@MN>!aY0(UaM1>0hQKJpr~kKicVWyp6Kx=+p%$}xWA{X;b zc{h-Gq0hOTO$o!-Z9C<6iA)~)eSFCR-4EDwGj z=l#E(O8&PmGO<1l37g7OOk)DT^UO2ZV?a1d$vaym5TY8$m6L1IUEo`GlWe_O$L{az ztB|Om@YwnF7!S=q?`EnfdTjy(J60D3;ZadS|JwQ7rm!lK@!6&dU#;JK>7(f2w#c{<-}YBToM)j)hX^9~;nVJHCn-leAi-xy02o;k&XOXa29ON+*26kn#-paO)r z?G`pqL$0VBfy6Qt*m9F=`FRIs?jc9-E8}i%X`uq^-_Iu3wdvOz)9pJkp@NQ%jzFe@ zvxe6+Kz#S9bg}u%AEx}-NFU%%)r!^=7&`-344VAm2N~T}^r|W3*8N8}=l|V$=36BF zb1(rLMKfyuF*Es<)0lXCaQ<)#pKdSuWuXITiF@(k z{jAW7Ep>msLg0S<0-c8(cc915Fn%zkb_9e-IuexzK8We{$y|9-0btq40pc4}baa>= 
z!!#cg8zO8It9%~rBREa$7fIRlUT7S$wR>MpDuMI`UG^;wUJILb6i+`UXcdf*T)J&; z8mHmiJbCwx_ZNUEdE|$8aHQ~9#d`tyM*k0cZy8l}xAhGxAcE2;NQWXNAt_xVCEbnm zCZ)Rt1SBM-k?!tJX^`&j+Vn>HSzPBn&vVte-t+Yx;~v8g?k%$ad&OLH&H0Np$(T`R z#!qJqyL!O-29R~SIHlcxEj^T=`Zn6};$tb)yflAk+-%AqC_=`w*aS2*><5-mS2JYi z)jpHK<3pII6jaFDvdk-vp`Vh^qw^OiLS0LL@I)rwl9M9TI`T8V)eN;)#e5)Ml8-Hz ztqRVmkv1Q=Gy$HSspF`qH)}r^FX7)RN?FONHqaE(Q^Qfzcdz*7`{8yCiI5bidU4`f zmK*!CYgTD~Y@e`5+WJ_BIQFVt2!oc)FDFht1+Wri&WWwsF*)-qguf`(x3qPTep1(3E&u^x z^x`KY7n{jwmvdYD!_K?X>2>U>=;hu9vA@%QyY9pmffTyLuU0Z9Ipz_y^?AWvI5H%Z zW#TG=v$;a!?TyP5b$igX>AC-qFh$j6 zrqRu%B^iP-v3OHELnn=&Vtd>#||5j=ceK?@e zj>7yh^cJ;Tfy2a|?`Wlp)XwtM4^Hj$F#n&Vi`>4Ai@=fML?3UE^ZLYhpv8?a1-tpR zkKU6vZ>*9x+}EwnIDlEZKDc2-zq=1t35sX4b1w?7JbeVo`d-pcLZ9tG7uX$EG2{9r zQbtn+;=Z*en{zpcJfWh{+i~aRT&SkaA;`i*(A~|HJW*!BVU&} z_RE8hazb7%{pm~Ll+~o!CzP>vGUS5aFPq`fzjb_oRK20j7f88hCAnlk9o`a;h49W% zkEsFWu4@JJPng2p)2lEZM}Er4t3s{{QnY+_H^+D0bpou8K|KR#4g3}L^#Hf97ToLn zU(WD#yr=W&{RgNyN;qHd6kZC!Zt>qMf63B=b_B%iFp<1S8$B`LiS;meHh@%06foB@ z#x;Fx6*djpY%J7w&_xOyZ@ouvpSYiwhRuiD4=PQ8QHyrS!~ySZL`a!- z!rp1~SRBYbu$27x25#wDDtWA*sPNu&MOh$eZL8JiA0HP8Q)yTk1w~tc7pH`bZ^6uQ zMCEDODRZ(P-X~?sd}$6KEZ2<{)FM9y#OO$7guW%)QKX{3b)k`Jb^0O`1eExSRhO^4 ze~=FOO@!neH=AM?3Cl;~?k7lY`>xV9J*a<>bM5p|zIz~>0^@QQE!%^%w!7OlDSnzQ z+FZEbL9|Y%OFZ8oT*CNi@5QqDjIOrIn=j1nP&PcmgIe47RfJnlkhqj(%0+OBW~eQ1 zi^ZbNUj;biW?%Y;d>qxp8o_}6#OQruY46x3YvY17Y+Xe6Nm#~Zx&Kj*a0Nh|`2|EA zq|R1brLBp}Xu_6!od5_!Om zNTQbU4p%^>hiSR&vBrV|@u|RAo^lbRZ(il;jP#{QT|rW9IW!s_dA&kB+|F)2Bn~FItrUFv)Pc6kdYt1_$ZAlYpru97-5!sn*HgF z=va%X&~1hHb%LBLSXI$7Vh_E}$XHQB{V)t^iX8!HC0~>cW4U@C-IR|{{S&-UJISZH zVI+>~kWdHBk>P z%RM)~gj}jrvdgDDc}!nP0q)mc)yR<)GTv{*G*^G{U_ShZt2Owi>?a5`@%-Ii!qZaY zO8MtLwxQayLd~6Hsmo=ohF7~I==Rybb@9pP1f7HY6JQ!Sl}a^2fS#bzY%BSZ{t%!A zy6R;y{c2*xeNk{ZTTOTon0kfLuy%yv%FG(w`3!od>5{QVaGqIMj0jKXo8 z{`6F#PASVt#A)KsAzyd-QoSv?c?it8<#x1`n_j+7k z)OgWjV&N|Hoxej&ROmY=kY=RSdU?d?Nb8Ny!p@%lOo(HibS#V)=oqpPj8)d#0VUJP z*J$ZA9t!nRhtR7MB0^ruPkDu>*-X zSOM>(p)y+Dgp{LkV7KHy+ 
z0N{CQ=pK2;kU#6eHpjO|U1^OK20A-Lfv5)fN@z*(I+@I{G!2^2I|-<&RWyvYAU8jJyf93gsPK+CWT`u9FJXk}nUO32lKLD`LZnG4GW6r{t02 zb%3rftvAZyO-UV>8#m3!uT`0d(4u`3XW>AaFca&xBhl-(6mZ}Ah%^;{xt7atMK3z{WD@_k2mXjWHLVI3? z{#Xfj=6S*prE(z&A&S4#N_YQih=cO9-P0)10Dj2fa!1IumyYPli_})zu?^#D2KZ1> zgV^&-Pj-d7?K(&%<1N926nP?qy>}?c6y>Y(T=z}wg6FJW{Q1#1WWe8jdwP(t?R|V} zf$wA7X_9GUA9VIl_H7Gy46}e@i`uQ1Wpz=o83r>_oc~lR`qTTnX9#|1L?(}4Dix?v zWKw2`egtliO_Pck?7~bV5>M)}W0zl?H~}N$)%xRsGC|8&verKLYFyWr2e>NoU!J`d z9PPzSMQ{Fy{zletK7QP)pJ~>|Z&7p~JO0dLrpi(q%Jh?1iWUoUe*TnTS}f828zEMW z`;Z#R$c;CHzwk|C zfIedDnqAl908YJxmznF-PN$^h-yjEs`fB#}m9eN2c?6Bcfo=~n33#dG=PK=XlteJK zsx3y)QOXDd$5ry{?S3%fOq1VaO#`V6Qx8XzSPQb$?eT*6r)W~WwmVhHu#M>dw(`V( zm#&tlj*+6u%DljY-w!#q z=6q~Dl+rX_rAq7UTSJen#>gRJyZrhB2>%_|F`}18=E88M*U4ks4=2#p{?Hd*n_lBF60>)p(KXFq&8{-XS;4)7t@#ZLUEEOP?QYnQ z$PklxZ%VrvBX^9gZ%~Ynx#1o0skL1`BB?+Ta0ZI-{T-{XG^Y}`di{rQ#IfXV z+1rE`!Uu}VniMkD=J4x&x&;44-b&|urX3pcJ)_3Q4D~-@Cak~`&S|%#vo+fOsGHJ1 zDkE11SU}ppY2$Uf=)yBfAe&s)@Xx&e9St=7%0AJf7mXofzeIXN`gZALYh(?Of221F z2_6f$=j}f=b2~0 zF4Q-AR~Q~}*c$GTibn=C?$1)By*8!1J_Je8B9i2CeBM zb5I~MWc%73LD>sdgh|Zl{=$O&i~A)axhP2xKoR~Rxd1Bku`Cd|K8e?9_i~s+zCFxo zM!Uye?tvRN;37L|JXC*Apq&-XDb zf~~#7zc0H9W*`Ys2I`kR?`-yHGxM*|PBbn4WqUfdrwYdS}WuTB!Rgdz0 z5!{?hJ0!YRe8f8>e(88qiJ_V=rJ{OlBzrXCP@2^_9_n7gKlHiT;JZb>Jb0HsdY51h z*Ev{rPHQ?oe(Z)*iK>HfK9+%$3rq+@OwQsfw1hR5$|!5GY~d!CpzB*5=kMt z>pIv8{*$@~2Ny75NQbIaJ*T{$^8<0s#PPTmP z7j~w1KxKO}B%)5=_mjk%>a$ZHh6N`S)GqSHoPf(ZLQ=S``E-J509pn`?*2Owd^y&wAc?rcXjPfj0M-@;Jx-` zu(P7nEo$ymjcI^Vdj{RNmbQj9Co5}|xWztDoj}k^MdREgg^cA!aRX`p=;KE=_-}?i zJ82k&RCpL@>F6q;!6cyEp@c+F7}KK*{`$Sb{Vmh>WJ%gx&jKVsR~=f!Q0=xQBwM)O zv^JpZOGb?#9luip&8N0(I90*}Y~{Tz}H^O&;(;{%B2_lBGoH^6ar%m$_w8BE?wY=0TM7_xzE{%_5JeMW6Kz47#;?GE}<`ajl6@9S5_BjC*Fo^&p^K53& zh7srgBZB`s9w>0YVeC2LrPeJAI6Y@hTA$rgM7OPGNG0^Q@I}mul!HL*T2c{+3uz(JVD^xyI?_|^4i}i`z!UfcZ=Kvm#@1PkT z4qX}#4^L^8>`i&|&5~BFlVxqbC^G2Wg~?0vjBUfOzjXLq)(Bvy)n_}vy8)9>h`~pM zW`y=>f^l{XUAYG+Wvr3h{_?lGB6A;cYrn}cyw}Mni?ice)Bua4NcBcS6D+yIr!HN! 
z_hO2tSMY)^L-a4SvlTKk?_c7mTR;onbi5?ElB8Ws<$1mA5;~k<{tSy-L!5yk#r5GQ zI%^G3Hwud(le8IG>*pg`RJlmvaan7*Ewqa4h&W_kXfZE?ldbj!8@{H3jE4a{)&+c;?J52nrgHPo>bjSw>IG^~jMbGJYqBCp6Vwx@S@+mXk0n-v zavs<>l4E~YrM1SHnEV*=u!t{H;NBh6J}@Q@+pYv(FnP{)M6%R;hHbFWQP#P^Yq_5w zP_kWW7TBS#^&|BtQhD{;eS;)6=e^${H~(%{4{UIIZCz9}G+sg9XCmAAW`iF4!}q$;ON~&Fv8^Ahe!9NXW?=^R{Lec+ z%H*FLKU!*R1W!&XekklvT2^=vac@|U0JR9<`vftZr7t(XLF=P;*y7xOqJGn51sK7@rmG_fCy>6` zb79!~^(T=HkH1hqi*!WBRz z!n3*p!q$%?TLEaHBJ%5InNACwY6Z<^9IYHUtDk(VVMusA}Cc zUuc|rWL5QXNpL|=c;t2Td9CdhIq0XA32qA$_%X(@zNkgz_OSyV+`}6Db@zh#6;XGT zaN?89nc1gk1(mTu6R(mj=($PgytNKqwHZM0HuZC&js9f*7s zsjF&SI^8;^eFx6~KHt%}8K6VglOo`=3N&}Z^7CK0;9=aW5h|OCdjJ0pw||GH|MZRu z!W~Wz@-xM2P1X?f4$=jGL(&wJV2RG4pViE24h#%bE@wnPWjX~nDgLjU=)3y} zz%UvD`nh4?nz6S}RegI0uPI)@O5yMB{O(WC04@N(HoNKn26Xec@SI`~%x8S=S2q=H zv;SfXcrn6(9_rrw-j@fP1O5dxz7Ot*wF%Ze)OXf4Ihu8q=@}WSg@_K&S!&Ql4Uf@s z&Qbeu{+`TQO7cGfhW`F(WP*`LyLx+P)Pv7L>wb@~`r9Y{>ziSMaIErg*t1^#<7)id z!ejP20Ab~>Km1nrTRQsp&-d3){Oz5bJ1`E-mW{=K*H-!6!~O54{)Zv`)A;}I2>ypn5Hj8@ zrTmtB>R7RzHWf<~g>2XP%rgJo zA?X!)hwP@9iZ_fhbGbMs3e?XRnf$VZ%#OEZ+Do~n^JQnm<#khn6$#}wva+&hh9 zFYlbIH7rXsY$dsWbkbQsbJ~-ytvpRK#JU`AgvcFWz9559!i)Kodk{-t==K?qJ+dz^ohi>cTyf&CV=sFD7Dw zKcs0-Lu5fT7W`{Z@r+hU?lo-3{RgJ?qNmRuH7pVuol55`S8X5kB!Bt^t%s`dUi_Sn zHW=2L4Wp#LsXm7bd%|Vte_iMP{fyoHy_f{E8FhCyI+(~sgX_6woUJi6{644l4<7dS zN`pvq2lgx{v0E-%{2T|HQ2YJ;mbvi|Ct6z+7SNf0tMDhXb5;r2b;UpP=)e2)SHdD;8Xo?-TT)%ohCU)rarT zmqTtHog)}$UeY$W>3O|6-(q*ERUE;c*$NbW;nElgXk&3{h zQZB-g=kN7DC^N|xAdyDSV0z#uoVam9qsua;1k81lCM=u2QRb$g{o37emeYnfb$K0~ zHdreCeD(c91dq0{Vdmf9F`MH7E`44@Mf}e(`Ng8oJqt(no!RLj8#_`K48gY*e5LWv zU1cWZ(IbE=hIP55A|iQkvi9vu$OxrNWBT4?V?gfZ4Mn_>@Zi}@aRBUzBv(Z$e|M&A z1NPpfLsVRa$rP7ry_^4{h6=lbFX^tzNV5w^{cujzi@Fbl$e)gFf+Fmn?nOS8$L$HZ zGWX(8M;;KUy6fTkT&we0W~l0CVJ_2!pYK(%VoVn(sr7q4%ps)h%Lz#)(789LNSoO= zLEgzWbvuPRxJ8jE0c5r$Bat0@XQLqO)TYf+-+^xhDQ_yX z!}LX-K!{EV*kVkVy<+3AMdnM1h~{s15(|BUl%g#?!LVZYb)gL(DJVe{zfNbgdFiHq zEX-Aug|pa+^oyL4fauFl#JRCby+>*fS|-B}%l5(~r>Up7?X=7YDoqXN$}J}wV@Gr1 
z$^OTd%*D@1(1iy_)Ka6ErkQ&ItA%xcNJ>0NLSK^f)8Z`un~Xrl1k(a%BI5$QWuL?c z!sD;C59Ur7GdVq8U1nY4>^QBT*`=F)F$%gi9n14s{ADmc7+qk>-h1bWb{Jarig$hT zFu{mZ>eVrX#3B&^zp~fn$<@(~iYnXdU&N}#CK354pZ(mO_PMrB{q^)W#>qw0n_}m2 zi8nUYo;WVx+t1L3Fx5BKCc9Us^#RR5;G{OuCC%Up!4U>sVapz_w7tQvEkX>@7X@*X z$ES^()t^DM88l)DME;w=ovi~-3Pna8H0eLD@k?wk9G+<|dj}|c)8u)-#JDPZ4fD2v zs=-0e{jV+S89V6)gYfB>knaS;7}@;oo*1l*vy;Lq36=L{N4|CPmi8PJZA^(s?Dg4# z4u=}awCq6^q9sly|5kJJFT;k)Hha}FL|Cu^o&TiBM76^pv5R@|^oCfF&y2)jW#auI zxymu_D&gn%gF*}4mFgt&k0G5&O?=`J1{O)B|qQ1bb#uN*$7*4fvCBBTl4%*;8zT_OX zyO@JDtiI*=*xV8VeKUA);r$b#6<1#4*`Sl3m~P@=38SGI+?L(WB;*B?jnBSzNGreL zwEsd~kn6@0S$DZuQ7B895`JrYiL~1?6)2y+7JbD0+^_NZMw4?{gZGz&SKTHG(t&K| z6}?!oqA=G04X#e|-`zR?l>lWoB6ur1=k4EE4gNKBg^QB%aJ9XjiSxYog9Soj!B{wH4}&>QGpS*=17U7yo>i~6r+g+stpPsw$dQA$U&}) zI4D3^E_~?C=13y&t69|Vxb+Doge$Vn7gcS;$YmH_sI^@TYPFctQ|<_3m_L4nSCn?& zgM;}lmh)*nbff1Lm?j?Ycm+$8g1I*0sy*NoEQd^!Z4lG>YaViS&r+7N>|jN)4Q_=e z*6Hu&v2Dq+g zd2Y-Nsxjjs_CWsc@h2wvUhPJKh;jS#)pI2(MtzLT$HWf0CFct_G<5J<;qSkW<$u7= zSV882C+qRnjjQ*W?g}UIDDykzTCmiXokh@H54cim+<`m;^E z4&|)OTyPuX(^I|qhosNsO(&Z2@{@6i3#E6`<^**jW=c!a&MQ&ROF1#b(Y_yJOiMS& zL(<;vYUgjn4iwB7Yl%#=jhnZ@d(XF(lMGZ<*uX7e!joZPs{KTPJop;!%y{gQbewQB?p+$)b_d=2e1 zAP&z4_6UUb2MthntCC&bJ3eD>cout#t#v-l%hzw64w_7%KE?hA)mhOa`HlIB88tt_ zN#tv282EQ5Q8fQ%pk~KF+tBY-K>xMCVeoJm;n{oCb7^8Ob*3&QiSJS^NZW&B1U3)G zc#3)lyvg$}&IYw;w^Rl8-(w_`y5fH#W9B{YhR&Z2ds#0I&Fx+77QKw3hEu`Bpjtxr zBUtQ*Uir+i$F(y6&7PU@jo$AYB&MwX>|v^HFGb&3MAO}4Y8yOHv|P_xX^S4&KN3Jg zm(QpZ*Wcw!B>S3Ke_60`)21gj;`uC3i}XEye#p`(%@j8}hl8iy!g^?|p)b79QvP6P zFXZ-D+9ro@Jj`HfqpIFak%5R$J)~}punsc!V}9e?9BI$nL7Rl{Fw#n|?3VKSLi6Gqor=cL%30GORVEP$e5qY)MhJA4z%w~b57W%Ns*0|;KdDl(#KHvMkL|D{s5 z$917dz3F)CLaxF$i*Te(*{`=(DR7XpUkPC*qGV_?^OgG<|w zsk%;8#6e~)L~P0|YsRwuG)5t4}a#EqAtgq5}q3M?uz(>>eL6z1F1 z@sqMf3H$Vgt9^)JKeWPV4k)2xeXrOaYoQmaW@E4v<$5e6=x=u)oi}qVX9tDLc)TN= zO{i+=S6zqLv&YK}%=%Hu*E5>X&mBEqn@fx3H0wRhf7g#Mw|lhxAXJVv z^Ba$@1W5WOm6Tc42Z4^0nOglz49??UjOaR(Y59m8UKl|_`t3ifR&b~@#&Y&NS1EMu 
zSSDi*X9i946{5FxR3|XDj-_{2=3ZJiO3>H5m|rXA`}3_Tj2{abxJnBFydM>S{ze1M zMhdI-(yZJka2WvQ)eXe=D1iW@T|Q3XhTiLjU;XYW@E0PfUM^6iFH_+af`S9{R+h1+Cu3aSBKIO(jSP zIN}W0yX{uxqh3Ey?$JQOjNW3Uz%>1|nBA!=!s&OhwV)jkkB*-J8Tm z(@VqkE?k}GXhn7h(`@S{q07?+t^A*vN47^F`?a$4)(^X1)^rMr);!QNstzisSD!Ry zJDM3Jgx|`)m9|r6n$F)YmYUq5T zP?Hw;NylCR1oB8~q`5r3%=Uak-4&n}#Q^v9ovrCuW}}|r42g^s(sSH*^7c>Oy#Dk2 zD`3NA+-DH1U>+`Lc-?Ta2{l0YM4b85{0N6CYIo9rEn@Ee;H%6G$}ZXv4nyB`J=cTG zd{&mNQMr(6>*BQTB9dED+AMn8NHUvMyDaCxh(%dFhi)i;bqvNB~# zKhv<|qH(xF0eZ*&Q@t!%vTPAO+Xp|lT+v=zUrKy7Skh zhX&6J7_^B49#)2Wl|^_=4CRD|O)#PTpq#cmET&ycgCx$Yf!x85?&ispT|Gaz4qJiQo;7WV25&x@B_&EL!lkB9#|j9j*NVL1 z{uIMxp4ST>q^kTCQsw`$N{4!QHfNrXuPBY@-Di$n7WWR zHuX@)DWQ)G)E~s{k4uFx>uz!lvE(V3zvF{_PqH&IRAz!hH}ECB-N$-oJcVitL@5y6i2tqg17~11k%a zP>;Eos2vYfa?2ezDNuX5TB3`ak(w&^;4MtjUSC^~lpvRutC&bBW6lmpAvt@%2~d<0RqnZB4(+WEd>K zN)N7!5+w>Q9qhN&rjm&WC(~DIBn-NfmbK*5PmeuZ3MgerO*>x3vP1KV+on=TT-p+< zvLRaLWitzP3~@yZ_?I}%eR zE&3DxJ^Y=2LT-E+DrA`x56?um_pb zU&)Vxu|bBi8ssN>h?VaR&*y81iCZpfOqk!ly`1$Qs3|xhn@DzZf5@0_QW82;eDcOT(*j_gzitPzdVS(&f0uzkXU=HnkAZr z>Gq<)ly->DXI4GHpkS8J-Yy^qc-bb%_nm$rHcuC}+{c}d!|c+&5|Gdw*u$Mv$~lm* zy=_q3ay?<@b0|gGsy*$!x#hhL)g@^mIgw>=Y_w@Y#EyL7NQ;wn^M$hwPJLu*_^@w& zxLWwbL3rgB?giyzq*Axum1q zgRToX<%er4tdqJmyS3{2ImQNI98ERD992*4@J`B(S9_Xf4dS^whwBGZUnd(K;k$QDk7ZK0aJGmK>f$*dR8QZft;P5mw zdKe`UzgJy+#69MBVo}h4`OYUUu3Rj_A`P)?t>`M$7H_JkxGlirIbijr%aW?gXO&&} zT@5X;FNvpLb0$uSNIf>|&1q`mLfKivo4>6Zc6&Vzs*af<_x>>SGCU6|&8Eoj{0^L| z0llQYmAJH(i7F9vfWsprU&O}7mO36TKZ4iM{r#Nt-vi9kM@AsyE|cIR(jE|29l7^n=?Ye#%drz?yx}wvuD# zIBf)8`_q{Nv@+|7ofKaAhY!cTEcE_IJ4I)=0jc zSAdKar5)o*?u`!DyT|!%IW9I{c+u782QjA<$;F+ivaBCV4np|(nY|NX$Fe_K8-kMv zj3zEp?=@tQ^s_WQ-?Ky2-*y_(aoOguh^yZS(T&A9i_kZLZJ1^;hl;B~Eq29JGt# zWg@-0v>5td#~qL55Eywouy}S}NBm?NQXQ^8oZnkGX&iWd5&hp6l!yNdyHK%JMXw@dNidCdk>46dZ$b>K- zHq?lx{w$?0`9M1~U_R#Oi*5w!+WphnRZi;u%UAfvHJO?Cg8bTg9vpGnq$AqrlaTrN z3UxLHm6ZhlJmcaBzT>a+T&L*R@J)GVemgjoJj3oE9_D*Kp4wqDII>&_nCdG@VlYJi z@fHNF-;XOM@+%dYUrj|70QH!9W3Z?2;l4P~e#jIU?Cz%CVg5<-hmgsif`x4C*+z-p 
zM0aTqMN^Bi7|BpIz`3YAKU~N@`^B8M88f%g2n#S%J@C1;-?%!a{Ly=18Pol};bB6b z7%bJ$qohN#%DjUZUemnT|FTifE1#*7kWcANqtPz5jq|jb|8fXrG``8e$GP1rXH6g8 z^^nV9v!L}RZTQEP^jaGb`!hRjDrsr1!Ay?4)iTQY$9Niy6 zxvxTuKYpuoz1+C%N(~tCaUYy^u_YJ?-S=pl!WpRCk!2osb6p_tI%}sHTBP(CI{W6~ z(*F74>2=SUwCt7Br)v!l-Gsw2rK}g&3b3{%MuLjr@>De~5OT{F2yR zJv4n`2}wy>WT9HL&!0bUy=AU8+i3t0wlzNji?BPaCa`Q%u58&1{{0Lik;*_GFgcfh z{nd;k$fX3%@XJqX#-Txo0Tw=nL5LH^q~5UJ%_*g$O8wqKRAF_LsmnJ>8Yz%ETgYs# zDyfW46S4-UQ)pmAn$y1haEg^sw`x4K3zLK5uC5f5i&~Y^FjX34(XC+uVbeG|BO>a^ zwXx@b!(yje0p^(;A3(&Ufp__P+gOp+3;EAV-xa=>2-_ zZ(ylp(N(}A#N!~SL(rUQKJu2if6W~As?gs2A?~)jhq})X$gI*`&`LZb|2h=Whg!*>pG)qml=)guRD9 z7RO4!g~p<-5ry?1EBV(E{%gOj!u@g5z+*+`@%x$|Diun1EdFWd{liNcY6K*i+$Rk1 zL4QhF{`1Qe=SLVR3Ow4%Nss=pW3%PpIzxz$^Cvu@HC@PXNS~e@cvxI*ka7KG(0&_3 zeiVddUB-v7en`M+S=@^UN9aQRCGIz;6PCGLD&l8~Fd><~7=31g14)gzxcq&C0 zZ)WHVdx@AfRTl|X{gE=Rpi!);Gq+5_sgfcEx0)JPggDL7Hk5j!8$*s#o?oZ=ntW3D z_EJ&R?@;wuNSdgVg&Ar7;d@1jdfv;xdTk}of*>a+*T=}g!7=nK(7sU$yKVwCVOHyKougtYJXt=wWb<%ZF(uU_E8$E5=pPpL=0)D#Cq_3eu9P9$4q)eBoQ z9B3(I@J2UdS3Ml>C`ohzZ^xYsS37`!pxm51U~Z)UkxgbH{g}ACh#mVG?kW*Ee~yCn znuASGY+~Oce!>$Pelcx5R2h?>sUE~#n|%s z{TKejlI4#?FcEEd!?PRDi>lpRm6u9lrJfD>oQEGTu#$P8<@cg(W8N)0 z2ID2g6tY5ycXvXD#N25=dWM!2wC^;P&%bIds+NI>MoMQw*;gPJKRq3W^Pn z<~5wYOztxWL*W#0Bn`LIu>npymqNg*nNBcWPX{chAj4dMGG$b?sk8{zdKm~rTYG>$ z`);3Gsd)O%7rUki9ud1*_sySV0Ou-D!#nJ>k+^}U?K5$Ao4+GSI9YCY0KQQ#5K&J! zE38zEa<5eqKS2nWW|-!+J|QW4CJqCpvX@h?(es0e6L}8yxKqY0upl)K7G3Ub^$(? 
zU_*I*t(5JLHW6<9y@fW=@i zma9m1X{?ui-HPTI{x~biITz@7Nmp8e!Va5R7*Gm}H5%86030N|G(2WQ$9L2P5|^;O z;2N}LGzDXjd$)IXy8NJrp8l-ZzF<)gMdVpsUF`x9iTj#^n#2nztHpT%EtoyzgP)%0 zs8XeCfD9T5KBXt=$JK8^i9w0+;&Qd+AxR{l^osy441OL&(y!p^%Jm$Gtn?efZg2K) zd(5Fgcs_Ok*&gn?;RfhveH$q+XTjrfE^m0DtZ5f@d{aPkozS5?Breya{JXSGM~|pF1}j>u#MErMmVo zS?Ug=j9kkgGn0-6lkOStUj>k=~w=+istTg5Plv(=GbKUzz!YqncXT znpd0JDwyCi;@6HU6LSB`-RQ)K7?%rn{GXf{{PLZ%+Ql1TFHSAUQ0vmWOP)Wl{^%C6 zn^evhNmNt6$PgBXh669Pkn`tuG1-cTF%7q$}2#Hea4;X>-i`W~dk2mCr$;W%#nSan7S?!yw(zL$T> zjeh@wt=vf|-SnJfQKXz$HD$UUOeZv~yL8K~hC3P*iALI8sM#)n)Tiqlr&)ccQ0 z9?8%k%L;BNY*f1SKB}@UL!~R+v&>u1Zoz;0H08-}CoQ~^uas>N1D%NdF6~e*wu`># zw6O0j&lWW7yhU(|*|MyzdvxU8S$K8(MH0-<>r6|oX#J&QwuSZ56>?&n*fm>s=~_kX zK*N@vb36)P?V64&X|Wi5q8VhgDuacqsm{#V&W~`aae~byB-4Ymnd+?W6l;M@co;5O z$YF>ToW3vB7vlC<*olXwnA}&dFB2yV7(GSc^VqH-wKhdc;qYgj58+}T!HA+Oq{02# zP&y7}pL9FLU`1l&#exw&V%2_vEMHykaz3S%5YVV zbhxwi*QVX4kLxf7l$-TlXH8$kjW2{8OvVCiv|kJ{d-_pbIk3AL)A=5v5i*{cyi5UX zNBZn!M1~rl2$q#$5U(ZgMegOOr=4_KR+Aha5xt;Mv@nPWp7T;N0;S$E;c=D=Qs7P zus^Yl;=0rA{XMa4tLK_V`Io+GE(Vq(DBP8sl`a(2aXFNhEr!Qs4v>%UzAyinta7be zeaJ)x$ss}dJaiUo_=K}Gk1J5|;FJx(TS-HXdS#nct!p~}izW^M&s>3Br4 zsm1iRnX|4W3Ux`Pmkl{ed#Sf*lDi|b$Wol{C*QjHqmTR1Q{Bqjt03g&9vrQsdwec% zsy*e#7)n0;o{HJS#{2{$`GbYAsb)j(4a}j>))6T8RO=N>;bSvKm4kS-<8N`RlT-%n zUw0bfo__R>uzjz&0_7kOo|x4)b{TBwNUZi5W|0Z>{$o1u?-c*HjG+XV>SCM?X47f* z4%Pb{kiZho?-U`Js31NT*X#3IW}sUsm8((Y1> zSV-G)6SZlw>`3eCrh?sR$E_|tG6dTj5`8pg0h@@+s;0f!w}v4$#stE#8n58 znX2N&dGhz%1XEbLn%yJ4E2O zVe;~*m7IlIPH;_JMO{F|UQSx{YQs-`*0?Wd)qUu-w;fojmYLLrlHtX>b9-jm&G%K2 z<}zY!c=_VKVNeaC7u_TaSInHa`I!U3i|e;PDCRQzllMe7e)6h;gP%p@Qu@c!;F@ie ze`^6`eajuFI$VQTJF0&mjbvMw(x%*l@{U|VMeE9|^c$5=YK-w6?dk9*E8KULGYfB5 z2dK7JSa9UK9Uf$i1z2O~z8=2`UZVj*>5t(d8L@$Y!MBS- zkQ2nrPfdfbuLG|In%-+5`ToG7(qou0-27Dh9H|Z2`3z)w1}#CXNf{!$WgNvt-cDH zj!{=vtthW+6_>xg)_Wf|e0OcYFIOE4MAa$#O+E6?Ghkg|ff<_e&b)l49;C;xn%rsG zGRmn`FAmAUTtd~zSv%=|0YI5b&FMS=|V zN;|s8$9~4sb!j;~q=d{s`56aF+8(T1NAP^4ge50D1Zq8XAN8+4Cn`V+7NT=Ntn6ZL;<2O7p0(?Zc5|)qR5=uH| 
z4j;oSiO?oDH)Mh=gpDN9yv!3kJ9h``Z$sg0$L5lJgo*)NKwjKofYofaDi%B z>mH{qSH}7AMla|{%~pNN7nx>Q)xl)JdGe$SD8@?6mX*^}5Kr2rag|8|t)T>vJBKv6 z5wQeC(c!;}j*2pn=6EO8d`-;fe3E^&YsAN#j7M|>M8lZMj9plI!Hs^cyVjs{+>HND zkdtF?BrEh}XJQb60V4F<6kNW_;JE?a`ZMm8?hDg<3qZhkt46X)WVB$g+!$7pS6JIK zHC-Z~C3U9=+zSc_YfJtlY-b7`9w)J8&8fWpZlTjgL*~f;hqJd1s50Nahb2`6MN&dQ zX(@#R(kUX{2vXABDUDcwiga`6l8`P518I;Dq>*lv6b|v*#~a^!=iWQc`_4ZzI>tHA z^NGFJT6-<2==?sefe|*{L?Q18ecg5eebXWTj9BTs@!}pl+h|gUr#EG&R0jA7J6{dt zJYE=iS6FA(VZ(!kk|90ndNmVB_sp-blT3?JIAY)b@hH}aI8o0jGyb*%$MEW^=u)ZX zD&_FOs`EDj$!n!Gp{10@xEfx>GkCQ6K_e}rYc)$=G=+A&QQz;0$wuvzvs+x=1! zb+;L zSb~oQ*N(<|v^1C3S7qJ1_(IjnA4Vb6RdQ3$TZkI&($^Kzry3YXZy{xedOcOBnBjs9S?1-yH*Zgz>AG}} zD`Rm(KBJ-e! z?-M?IDgON?eJ@2--*Y0)5AxN!su{NhbfRBU3tX+Oh{ol)ETmN}EdvJSU1j@NQRPlx zvB>SUZKU9CdBrSbH1_M=Y8_Mght`0B?-9n;gaE2azX-Z$%&u?Up3 zY^UC}tIAtfh5<@G${FwA#&dX^Bk0>_4T6VE@mRE8pqVm;x?O2Yau{e=1_1j+%zajbe zQ@}+x7hZLJZ)bYB$gAjFcdQNn+Qpxgmz*3s?1zu^jlO3l*WGt8QfcoMk$%U@QFNJ( zlHbh}MbnmS5ViD1$w$9`4a}fVd@CKSvWZhQ8l_d;<9|VDgH_iqpVPyTs~!Iu7%kN9 zuy1x|M1-cbnk4^gDpNff`I)JbK(}LWd7^T4)JJc96xr;P^P2)*xsLKGM+HASCYn%p zv0k_eRj!qO*0Y~&H=RA!TWaPX(sCDKBDJGkpdCt~9WEKtSouh9&PG69prU@F!b0os00WP-=T2_w&UMz9kzI6K$L-jpbFk4NJSWJm#HmVv{lm zRGcY&q9GPFo@^Yi5VSq!iTvX)6|fS(6Z#UjxUA5vk)E5rdCKw;`%ILaSvqmJ?--cK z>{o8MnBTXdk$Zdw2+-}h!Xv#!t^4c7LE?uxR%3g&UauP{e|RFT+g&{>Uk1B0bl_VX zGxB-)>!qAg2J7rho^F167M?M{Xl4N>bM-rkvP!??F*v2_{{XDYl9)Vhvv{1+2k=k$ z4r$(__7u}tnl5%ZR6|es2f>z1Q4Ym`zVJ?wQVaXVQz9Nu(yBkcUpuaRiClP|M%vY# zDb3Ro<Xz=y>t(!Om0oY+^j4loQe+aB6&pP}-ly;; zc6?&3>B-boFMXh`CP5v^?(S)>;Tuwyx0DNbG+sqb(8iK@t#qrH)EI6_jfPB|OIy$S zv$xeQlL|-eFh}m858tUzms2BCQr5@FuJPx=?0WZ3q~><0|52|qI-MTZ+45Bc?zw6# zdT2XGdc>3vJsC^sD}4cl({cs|DW;~Tak{Q``S3c)JcNeYW$%A`|GwaiyJy#_O3U7> z-q*k_*{$^Z*S%VoAGhYf3Z5^}t5Tn5%rd6rjvP|-u`aijp+`*WuLT}1?4om^=k`U@ z`A0Lk5Ry?y%g?XDa%z&w;q4Xhr>p9Up8pB9R0fp1*G57Xz{NNM=NuV!p~F&NmNS_6 z3#pqt!P7c0|TkoyQ(vE zu?NoqOk0p;aJzRyY>#p+K|YeVb$n7^Ysw|PF_)wV43HBQ$T8E1Y(hV zAw231N}BtwlW+Q3L{gI8m}{xg2iyg(%=zw98du3auk9eTJ7d6^e^jgMM+jSyV>h!x 
z+|7KXD=+VNux3X5_TIFbxF8Bg}8~D|Q3!Hl?kp)>i&fxgs{V)=dyK=$*XAn9G zezR`!s7UW&BxmEHb7Y|`51&;)RGnpNM&&3HdeJ&AA-C}-MPSg?9cCX+k?*rV7dzek zJSrk$+7{y{Kn~uuU-8{Wv^5VN(4!E5E-jrJz_9j`nO~a+AI)X~;moIS1 zaOf7gHVw*d4sPNM;TpfB=gcGDaXqLF;{DYD2!s|JM<2aD$AiUp2D?pTP>qZ5nL1Yn zOWdcTGNXm}C!8YX6^p*M#nHRD!=G$rX-2xWl1+S`k}yv#zFKja86(V{x;QHmP}aY9#%+2kU~n+q#I(+|(!iZjSYyTszkuo6+}|XOyvVXvUJL*2zZx zk4gIjCPXNnnuL?@8xV1j4EYK)x4SG&Lxt#{d#})TouA=E`*~GY;X51l*P36e+3u6i z0zM{1-fpD)Vi%@qsF=4}=FzHW34cs;bA1E!#_|2oG|4*Y&Y78+1t*s|&3sB>XLWyz zhLSrq#14AHY>8W(3DF=V&oa2Rb5QvkYE1a%)&RxxB#-9b8*~%LbS*U2gC-`N`xS2nqaJjacoAH!15UoH z2U;p&b%a&l$C~WDpT*wxoZo^^d_5ovHgTb1gY@$4vf2GfF3l{%vlQ31DwJ%_OnX2>}n0xlce)m1)0xRuddNjv$lN@ zyi-4rzV>ms%9g#*f%e+jH9UXhk`F4z-0OB2n%7#;aU~UK6Zt{&IiM!9f#cd*zV0|e zdpYT=vkLj}eD}BRdu2HV&dcFiuA$ZSljP?63X7i=#w4WNutq1B7Cyx2s}Ft6U$CrF zFVM`XjMTB6-2AMw{}Q`2dM%}_cUyURSdY7Cb%8=&naG>>X3y}Zg|v%Jb_V@0sAsjyS9Y~yqQ!DRKxw3sc$64KU_=rlX z?BwU)&cl=1jupMKS0a#f*&Sms$Ez$IhC*jjh#mj9&=cc$%=_4@M=RRz*dp)w0%Kc> z>5yF+%i4j%V!&x{L#r_FSX*9QvCD^PEb{WXH+DNmsn5t+u?7dyJ;mjqly7<3y*1{3 zxOiij&%P+9kexZ2ks{*T|hG9ph7B@`fmV@7ZbE{ zwWEE>@I!Z7N0!l4St_!_Jh0$4tA$65i45k+8l~Y-jb@eP!2N6$X&yEmd2e5jXKUlD z6$KZ_)rzg8tb|i5_65AIPwKh%)HrAw#jxc}%E)K!^eFS>cnxr@0~=39@cf#U zm0`u9m}jHEz|(*7Amu9!M|vf?*9pZ8`0?!efwvG;4#WA9|^+}9#5h#g+H z)K4(FRrd0$F^XEi`PQc}T8C1nMc%IVZ=4_pOj&z^Xl)~__W)~3Zop|qXVDE3zRIVN zE^9dcuev#f3E2#?>(sjD<(kQcn0F1AnuWT&ZXjEqVi-?x_>y00LFCAkN~9(p^Z~6r zbL9{dGjwI+AmBWA_Yg^ZQd5U)h-t=2vKz1cVrsu>(O6<~3*9J{t#EA2fVW$_cL?_j z!@fxo%2ge`eV`uLM=NnoQ!IKfLC8xiV9HNc4KxoP>-jKR*}5C;so|VmEbX8VjWBl} z{?Wr&>LL3b)uddp)1{OCj^Q{xkc`SOh3RS4>^_&kLW{B~Mma&l3I3a|c~4K20kPh+ zm22{$NX1b z(y|?HNcW(cQ@&=rE;D~HQs(*EDpJ}4se zW%xh^%g;9T6>g82-y4-uW^%h6N`!do6$g7GEr$B+cVeK|S($D>C^3{(q7zqb8A{(8 z7T7s#I-rIQK=nLy@+GcG>qyqY;gnGmy+}{usN98` zNw1d5J;#5S_x<~M@%79=duuDpx4RU*`mb&As|{ugs${lPHgRc1|;$ zMEeV*p2Z3H+I1cqWCU7i9}Sw!)%#Kzj%=f4dV2zoGl7Z)=O;zS$+r)-q*`RUFPIv#;(g*++SWNaXizh~kvExN@)3u7 zY1PocM@3o|$u{6y`$A98`R#y}wz750m+%ilyY1oP-_@-C#%=crt4mLgemkMEsyyto 
z>#lskHa;1m@lgY1g5~1Y?75$O9zJNDMPh-TA0J z*YrTCoxaFSt+jiB_?D;V?#!n|cI#B>NaanwRi%5wrHp9GXJB0GGw7S$KF`!-)#bn( zn|W^Ue`$C?CS-@xE~KgdE=mnNvsrC6D~x@y~tyc1$yOd=axZph#wJ(9!D8Qz_D zgg6(|^++?uX;GIiP}_8_w?{%p?A+fvhF$B=qg>bRMiBn$!QHRr8i_;{@Ez`#C8sXQOvt>QQ1mvCeW^PO^22o*Ec$wYww>LqW{G9#9eW$ zH3HOJmyE>{%(-g6zduKx;0ZF)sv4BE8-0M;6g(GX5Ofj ziS)w<#{b>^8(&iq$}YR16OU~T+S!2EB<@iNjC@#ZBQQ=^hslRZENed!=M^t zU*WU%S~D%T8?yrc*dT#&%#PDycf)93i=K?%TY9cT_$TMAMlM=?05MB?kKe}(&0$Fg*1ECT8S=07_aB~F3DNz~Ds5Sw3xDz#{rgJdRRIMQ6l7bifB2*R z_YXhH!EgEh_?OTX;flJv@SF{&myrXdnkk&~;22m*`J+>q0rSkq+xI~kHNV&KF__qF z;_ZWy_$Zxgg-vL$HPA8O-=%O#ScJc)ENm8XaGAa z8qR2o4YKAWUG*4qddk4eoS0{bPP`Cwvo45Loml`bDe;DOjkDQfZG^*w-of66!#q^4 zZVH%DObT&yXV0G9gwDMcjAE(2EWF%72@Vz;vd{Mm2zsnjZ`YELkN^smDT(!)ht8`G z&n;oUc5C2Fl=l6vZ+=Y^$wLB4j`C}*Lg9s^s77(n5B=&(2PTLfq4HT|74jaE$o7;1IycEjaXkvgIEe(kI|aejikXrj9^0VcZc z0Jtl?px5A2V>?{hJVE2S3bS>KTDoZeUvxYeTGaz(jl|wQ)pm`meX`7u?`-Qu0nhKL zOO4iFuAQtU4XFfzf%KN27dY;|XaeZLV4VnlXH+O>?VW9x&zZ8sTN494T`-_$2B9L% z$%bO1v3kzIXpse)u}FAe0n1?%ApdsAihjQvRz3-GI+xy_v~IL=d}aq7#;ByE^jQxW zrq>J%46Setn!@quJZu$aFBJVn|ik`wUlA3RZjfmjRS8>NS-ub+x2=t>+tr+ZJy@d44FA){{!`b?9e%s zAZ4*3Qq*_$;W^G%@az*NhYtJyDiX2DkU&K7w&9p&%eY=C&h1(iN7U7IiY zj0epPxSOtvpWki0x49o?-tTDfghJT!jVsg+E{5Pee^&gn_@fJ=Kbu?>l@YDtSg%~! 
z2_9{dLtdUaR_}dQUBq{BR91F&2=0wx+bf`Ae~Pd~Zs{j%bO3=&mk*pn(X8s()m-YuX_#<8$S5F%#|Cu|hJgsk)q5}(?F>n&kayA@Z~5u*lINC<{4?j?km=`)Q<1_!$%o7q`9TPk%ujA>9b3 zH&63RmnvO8ff*@YiG+JY={?yigH3VTB);{$;)?RQbbX}-S7J6+N*nwR{AfX+wcxa~ z=W}v=WCrR>XZQr(23w__cL#fr7U4<1+q9=JIRmX}Q%#YYjf<6|2h^fIUro>TbxIs$ zU|>~RGBE&4XRh71Dw*&=mgmOO5qQ>Cv}qWlZp~OWJ{w}coiHwil>>UuuFsT*vpr*( zI^4GK8_0X{6>3HGbIkqm?%u1iUpuP=*?>GchRZ|64iBplb)lp5WBldxDm0jU;>u+- zR-b6PNHV0$Qmg`=<5D(2Tn*EyR-SS3rlKv@4g4y1Tqt371+m~b;FVf#` z4Z0w@h@UasgP@NvSMBSxaa$g^5qKLw9fEia#f|=Sidck6ydNj;8F|#v@m^!>R<}?O zg>I0`d20Uq-8)-WLF^U9J0&+)1jiWr++ZOO0_{8Le}Ix%7(AueHBTJs3<)FxKwEn> zx-4rJs>KW`V=X}E!*;hQ3aticR6W>}II(}b+0Mp99P7r~;bXRcIJ__L&rAD1&&R)C z-D%j?3IS?Rx^~6bC-mOJI&0e-fJ1QpjWkx2Z7;jzx~&Y_I+EKTcORhC~9v9kGtuoj9SD_zBx8WPdW(g z#YY`7M))-z&T%Ch(ReLhf%W~;MbxyF5N#3OSz~Tf3q*F3&3ZdCaQ^kHeY!ALYChx2O7%2GIvUk-Sj*usL$|=G+XrE{G3wxkz7+4j)cmA^A?Acen z><$SO`ENtblgb)r5J7sHoMsP>5SIqYx&|xI@U1oGjw{tIg17y2*=XF}#w@;=9VupE z^!v*$kPx#%s$6rFiQuZ^Y38LMYqE&uQ1V2JK`!sL7^tl7%5>#ML`Ke7xgft5D2&X5lz?yTwT=bL8I4CxARHm( z=9e9sKL7lO#gF|i2frE9;{%{-gC0hfpTK}HbB`#hKr5L@wvqN|QHCk;)%kRZE7027 z^*1#p3wc;;!X<0*GVQoTLSc6VP6IAVy)0=S+UN>sV$nXr@ds$aVEJDBO$UPkEFKZ! 
zB>oy$#mVz0N0TRR%e;BqwqLJDmM9~2s=l}YQ=cgGdH|vqP3*3@1aEKMoe!q?*USIE zmcse<;xF1#Gvl9V6&vWUp0lpf=}(lz*|4m$_sFD(TGiqoA$yUoTesf<8zhNo+@Ouj z<4wegyiL6YEZ_9_SCXXTeTr3q{pq7dT}ZLXG1_H=5yD&+jIJpjNOg#m&Vgm1^gUc3 z7!m@zFVt(57G9f>4>0UfSRHxipeBgeduN%EQromjx?HAf(=^oFY|Ejy+I!EQnA&Qi zq=kTrFOK6<#t#&1QR~!fjx=rr;C^>g?5?lX_X=9au#Gb*haS^(cXmD>{H$qB?NM2$ zEV|ENxt5@+T;<*0^AP25_w$R3h^XBy@^>Vz62EyKfs{-tCV&RMPLGTR-xYx>%Tp4$ z5#XJV88wD+CT*R~ZnSOCf=6hndi{Qu%9bUeT6OxUftAV=Ts2F=N9IePksqF3NbHoY zZN7jL7(%o-R#Vo7pP|b|cGq=O%A-WWgBe9#gR&Hf@LNAc<3SkgZ|hyOhr)_-c5zuV z>$#yY!Xsx%jgM>3pcZm3`1m1(onJ2K zMF<<*iji>RdzacTmU%$%hL41Z$GRR!y3CxU90>xQF9N_$o@7^u_$gVIVrBsbRb%?_ zQ|1Q#0TD;5)~AfD_=0a&Y56_Uye+sKz9&S0 z#Ib@zi^h2w3ZqB1VOl1Qen*`uK+P5QHa?Arh@kFsoZs~&S@vCU<&oH`_p3?fS zEBRmQjoh?fUvw;XZGA`|>#8l<6M1d4YE*q8ceKumRZ%>=UqFr&?(EaFKc6ZpC~V)v zNmfokw3Wk=BMC|n^UA)y)C))Hx>bW$#aW*{f8tD5J)dBc1fbMyTj*3Ict%b!p5PEn zeYU-m)x|(8D>m+>>Mv&5yEcB6b=}r({dm2{UIzqrop*+6pj4ApR`#eYbSYL|fsmZN zo6>8C0Czim@`9C|?4?*xeR90GVYL^79Q@E=XnP827dERM(__f6)eEEN|MIo|IT!PP zKW7~i;DO9c*jbfOUYVfe;N|6AtmXUqoNZl!dBh$o#`aWX!iexJloMqMRYpwOiLJfok$EWYqD)(j`5ROBFu|t(KX@{`89s4#m{NoZvM@*(lokJZk_ zHlpjdNsfz+tLKtD~)slDXgh9(MM`O>$5jcqI))+9(AIW!li)NQ<8@U0H%vlON zP2;$H4Dsh7Ijj+W5KCLZ^XUZTf6@crI(;;k;L`P`e(9jAcK1`wZ{TV2IZUh~s;v4y zTuPa5$x&8#CvNjG=xQBtUebyNVZH!O?_G9*tz)4_Yyr|al9`V1 zEzQ}Fs9Z`@1#^f=f3qVAbKNm3FavM4uubRLSAWCpr9HaZP|Y>dQCqoE+&E!6_c?&d zUqv;yWGD1KO1h@LX!Oh1PT7&{(R5#6A$Lm!_<+5_w(+D?PFf`Y$S3sJzHS>5EBULe zo)BHuEhG1ZxB#WP!rX8&e=!0%O6Za56<^Ydt(($1QDQ|yNOg!1eLK)#AMMX&?k#Dl zK~?PJagiV(LBJ(D=S5@!+xKR2!>Gk?8R0rIVp!{TM7)GO_Ga@%E2slVGRd;grxf0S z%r1T;Z+$LO#MghN^6@>ynVD{(4TKv?nc)aQ+z+M6+F9Iz8dsXV1=+IFr42gWY=!vg zg67H^I0Q!l4MWpAC9y#-K`B}Fs6991rYOW>ns?kD17^41`d{LTl8o zME*nd#p;G&R|>-D{iA0s*Q{E!8;Zwmu63>CNCcphpG7t(PEcrMXmcv)LT%3XfxjIV z=-okSYJRd_Q*3pg#@}%4>cDUI$g>yX1=FtYMHJCRkHl?8R!Ztr4E3kyFFV_V)Dgz; z*je~x9R^nA*Et^8S`g$r`VSWc-b$0JcK0qmImp+1ZTjgs1=DrejS`GYUPx4jp}*KT zjZ))OW%Ff)w7nvO`jP(0iI$7NqK)~V94YZ}C->ko!_}jVo}njJ$yrMi?Mb+CC+OTz 
zp>9r9Tsj8PZdsa6AVR#=@aj?y#^VGL>)qAiZb!jb)K=ymI;y(c%|m)&0AJwVm;|-W zU;!&1zwE})s!GDXtl}Zkp?;k+yhwfM2$a}@6T3ACm8F`66qoEXJA(Y=@~Sn^nsklV zYG$MM%vhh~a#zL7@ANC&sv2_+UEulV*Jv42H2pA=Bnz<>J-4;skAcnv?B$r>7MbBZ zcG=@y{+|uZ-~We-FObN%S)~oGwWRkXB_EoanaR4ly9Ylh^F_q5l1Z0|X6580@>&OI z%E|*@#K6khW=mcL7ow4FRYbd!;e&Io5XkPeyUJ=dybkVD2I*p`y^Wi?rB>!|BBJiV z4s}ilmc@F&XZw{7(ya*uqPtSdp)07_Xh3y}BoyAkx+;aE{Fpiu{{^FdjyIm~;`66}0;vA_ zqGrUoi}-R*X1abm8+K($QS-9&OX>+fj`plo)T}TgZT$%{Q|VMFa1cM!3otI#5+of* zRo?1T4noRNo|-dMu4b1%Mx{H^vqa7&cuNYHsYl{sx_b~2dWJb8Kli>1%; zRb+C?Zy)$&8~`_87GLXm1Rq-CVj%FGVST9*Sdkb-A?)#Hzi&OyM5zG^srJxG2#r>X zZ(~D#Jw^SyOMj-f{(HHRs)jdE6!qFz75y7|N%-AY;~wNg0de<3&P@3Kl_dZBzs_)o z<7R#4EJuGzRq{)`)*1sXrsBUlPtjG+wYpOjQP)sx9}oWbpZLuBNVT2HL`ldpY#K0gnH+Dm0SBU%;SlZ~9|Z z@QKLV#E@fUW$iAzwW(zdxkgKs{wIEzQCBevuJ|em%#qYUavOR^QTT^VaXk|hMg;rY zK-a!OZ@2_r8z%I%u-*ov$8#OA9bZdK7=?sJt@iG2iyVzk9&3$M+OY)(2X~wKhWVjq zw;1Q(8Xii*Nj(!U7nBZ&F7wIZg4i4kdu-{I zF^5-wZJ_?mJ@M;<=gPto`jUB9sIv?r>%UHL;HwbW`~4j_zjzk(*e%$dpgU87_HC}b zUx~PjkA#-ip_0T0bbY4Vi@g#ijlfyW*BZ2Tc1A%YdMot%Ss-w8h7>$`Nl8f+Dwogw}JR|-q8^t={6owq{l(gtD zy;3~)oN1L~{S&;!7>mkIq?jRi{9WS9cvY_)wvggogc=xNR!*^f3A{?muv{TnF)=0f zO*e-od(--Smtf|%0t|TbBe637P&s;b&%3+B%NZILA*sbm7j!)=Fa&+BLDzItc=#2# zIHTY}2=0OA8DIR*rqTYfe*-BQRMxPfYCSeMYe_X>nDG`*tz|$ni0(f$d<2y%89~z5 z8kZc1VY;%|fPsa32jbk#V7k_r=R%ExAViC10YNjLuSR0I?BM#Pd6`+{vtB=y1)9(g z%@Z;XcvIX9DT$cOUP#=F zOP;V)0vDVi=Pnb_xfDK;UOxXiT-+rx-SzZ$7a^A0^`^iM$g6NK(6EC}&u#rnh<#v| zu8|d8oPV;H{!l)KKINmVclzt6uK9`}#7zLPH^NGC#g}V7JLc4KSrsYF$k`dKG-tSwF$*!QMOr#x5(Qn2R%uo1!wW=;>L0#Dznig6+5?d_?<-A)F1 z?KE=Im&@(1?40}rH2>On>QvWehBOb6Rq9J~zP{(T6L50hhT`H4+ohhT6=gqtfd&!( z2$84)CsWdr+c{Q_1bx{uCM&Hg9JSq#9%1({)UFgo0x_KEek4;SKDV%t=(6Sz6ybV) zbaJ8~-sn)8hi6z6&faifk5CS01&2D-01xcKb_43DOv&xbLf={`5H?4t4I}p#`gyH) zDF~Y66ci9pYG_yED_l99QPPfw9z#hNtF-(?r;8|y;V|^wiAJByOxiM79*7UG%tyQ+ zg4wh=gmkx(`|^0>X&AAbe!c_a+;^bQnfv);X5J+adGUz-c7`Qk41rSL54X{3fQ{qC z#k!tn2g)WzubwrmVwq(shK*yTQYpr2j5gFgXJoOww 
zJG5r;ELs0T{}lc>=1aAD?4s&vysZcQr)axk%tCPqj_0`iZ|KsD@S)_PQ2ev9_{R*TrbeneslG=^$6(<{TOFVl3lnBQ z7yc2K-jg%xLtB!7i!gxDkL}YcAxqVQZ+wDX!!cswy6=JOLPH4tz0aL|?kgPQgWZi4 z<1)0qvjyr{dGASqpP9($u@AKmmdW$O1ZLvqD0OX6zG3B&-V~d?*-&l={ z^rYc>Pl>q}UX^s3MN?$bS9}|pW&C*EunNTAeL?Cw$;r;xWfsrxZ_QtVY z3PMO#nzT+Nv3sNY?W{qV{a&`s^RO_Fo1o1d=`wVEEufoT@i^vkn8_UB=N=?3=2DE+ zfoSOl`ts_wJ^gb&(LwOs7w@sTn~nP~;Ch?1s{QeaYHq})n0Eoq!HxN$*19Y1TwCvn zGIu=~?(v(qmHzoz>O%aL3XufqvS-6|4ZujhGktgfXLE)Cj>(mOyrn;>!c~^A^UYug z?x8*yKeQh`eiMIJi7y=20Y-!bl%{T1qmD4-zpb-{xsOk`$3S4+X#PUnFZlSq-D5ZI z3yEoMJS@zb22;@b-VD66D}g>3!$fxD+FISa|^o=B9G8vNz&0 z5s3rfjNIk^>KF3jr}UThG~5)`&=3B-3hP-N`4YH`M) zkWtLO>QR{n&bHW~r=txlxRU*WT_90QBtbYuxgjN5<$aY$60XBH6{MvX2xMugb^@C1o(d{4E| zqAX92zR!>5^mnSj^9!`G==~MBPO9};jShS{O6q|(Mjzf3-!e8f{%X@(J9S?4%uRDp z2PiU8007h76?cPvaS)dO>5ArBi6Fjz0gNrHI<6A7T}dOl-E<^6v=_+%&}OAI=Hn#d zyNn6%98g%NPDDZ8afmW1i!DfXZ+aE22rgvN~^K}WV29cj(k zL9#9VUry3LR?4Yr@X1dxj&tVel$qn+t5WGNU|csYt{Jt}$Fj8^Dzd9&E3@1E3dyCq zFWPU2h>TOTTrM}F0^r>4HKX8iZAV6R)km}fpCZ+?jNOWi1rL*v)baFYp?1mpC9?JU z$!aGPmma`vlfB2IV-5Xu0QR0oEDb&cOxC@7Jo@oD@>*pxaAzomemlnwpkSug5qy>^ zst-uUr$T~s;@$sdJvphUJ$xon- z5OmXBf;inm+{<4{23EdN94vizuQQ0&Cc?-EU}bjJ+nXlOiH9WF_9j@f`~O_X6V}CbXKYU?X_B(KUl#1#|i%H8-7tuy@$)PM1BNwE6h#V)s4Q(4zlCY;)8^uz`s(D|B zPZn1me!JlR3Qg?Nc(2|-izV!0pp@$Xw**$S9%$rnBNlDiqu0-Y1@*eyf~YT21V)p) z_n}rx^3u7co~__iD+a(E+N8n74`wJr3GNKAD?rY&`+8Y)U#(>$I>39gs_{ti_DgE3 zr#D)l!HTz=u%#c4EWnoeeN`2>OsB$8<=Ibe{_Y`;lH(&>S$s-F?pNDuyUhb@F{?2u zlqzXrl~6j|(bL;*bq!@D*vwVWdAT^KiJtiX$}#V%Jh*M9NWDmq!)PXN15p!F`pWf~TLg|@rKA=kUlX~jf!a&JfSfnAW<(1#LZY&1dr zG^yJEQpKwJ0;xlC_J<)#=CzFuCk=*#}dv>l=n$xY@{hI4&g*k#%r@cL1lp zvP8ZVj4NT~_zIWS+C94wh;#`l5!7m#gxv^Xu)L4YTT8lcSQmuv`yoscaUl|Ww0B(T z@}JdBf8Xta`Pa~QTJYS;XP(y1JGe~N zxhyCVlMq=KjyQnkPU0QY`i~IZ#tDzomGzR`yY_+5mGV2fTZHma_R8B zuFDz==kfI3dX`wM@SdDH-U~k&^KVSCd4oRqgE*H2AqCs0m#)#0^k{tg7#}P`UebtM zgT>-M3-}}PkSxVNyZ}N6U7!DNJFWlQUgBAVxujnO^R7Px`^w(bQ%AiDEWPbdh}JIx 
zPQnNQ-=(Fcx3KJ>gb0Gp`Qh0HRx9M44FE*rA#*gIj=t-e#&Bqbs`{RmTeREAyC>i>UiZrsBYmWwCCWeU17|vuSp*UQ^$0O#2AF-mp-%eDN4T=Cr#^Aj{R-+;h z-_l^DwafP@2{?COoaJUey0iVqv-!Ur-&3=gCC~fMRwOL0-6gZ$mg~!wkj*-)P}<>c z;Z$Nb=Qcox%qLkSmQUQ|wiw$aj{|AkqH?A*Fy}~Z(ym&v(1`eW)&r^E5!C2KmCAHR zV-yVd-iNmE`~|@8Z&m64KWq!MlaCFBWByFIM20nhqatK6QwbQp8emLgI>4|kQDim zLdgv(pw@M^{w{1Kt&PbSe4yzw{yN`Hx)61Y+K7c2r*^2!(NWiCP(FxOB`LpjX!jK9vQafIO5iT?qij>(2o`17vs# zV@zPM<`1`?|zL_{H;Tk3`SC zPs8}L6_EQ5pf%F1X+zgu>G|adMtM_c=bOjce(ALXOYzF>Mzsw5_RV-Yp35J*BfNdTi@>0!e!{aHWPp24O|#PD zbqSXi32oAlNJt#MrU^Don6MSU-3-}!OK#{>(xx(pQ2EV%Y+@7-h&UI7wwV0BVt7LN zh?1pg#Q@9Dm4@QazCC(r8Z)8sFA$nu%Smz$L7UxEA}s0tU>QfV2sb)fB5c*~ZHKmC z2L0@)y2~N>Aqp?--a>gS56HSs{2iVGkhu*;s?1vq0gph|79z)V|TT`?yB zwQn>`2AsXA3^NvyNU!C*J{ZR+lkP~9LTjRB;Qp9vV9bSv!2`|3Lfv!+xF{29SPGY2 z4a=#}bn8OJFradsNdi2Uc)lrKJIfSXuqSZzs|W;P2C(8*%Rb0eZFyRZvF#i7&1C_( z(7o(;v@_IKsC%=QgAgq3*l=)g=F7H$VthpX4U|ANi>Pckr%Qnq>fP%A>eyn!;ffG8 zqF)w$pa)daW0gm4aucSw(3F#&PWciheQ7%kU} z@5yLzAmh|qtQmu{lmY~tIf}|4bnxoilPSv;!AUMEUO1KC%p**sK?=(+S_FEn`rMqsZaeUIwu-Bz@EhwR~1>^gtG z5wRD4IWrS^+Pjga!K>!P$w6$5^)Xg|2`O^VU>Is2n@4!z+9-#UeO?&ca{FhC%l zokEbPL-x&%ytl_cRJ!A{1H7jep@4Agwv#3(x1@{2RsB+=KWf0;0?la$F#2W*2 zUm3~|WT6}|3-ue$lR(_t_9e|!T04$K8>jH4dEeL&nb-F)YJa){7&g_Vc_)GyypSCK0^FX(Bs4A zwR8+&hlc&^lNhyRU3q`zBX;ZSW`MXQ%To?`qLK53d#r0K$H1~n3%V{&QP7XImGy|O zyp37+zWU&D`(`c7B>YWYj4fC@WVAwgW^XYgY((7!|0c3twuzN92(~- zs23Y1yBm$0@p@s#8rDbX$E<2M$3L6YCj=Vgz_;Nz$|6b0DFp^!BFsgcPczTW&Bd>B z9L&}F-N=*RN|0{RN3h70-M8%Jn&zbsCF`or#JIIy3PQfUyzTYzQ80iT4nychnzQs3 zh$e)~Hnt9(dZ@w(j4)zFZ=?!Tf=u$^MXEu}?{UVWDh+`xHvReiTQ|{ms7yt#Ee)kl z<`x>ymDpOM1bH!TtmUf^(O6L=fs!-s3*%;7>by`~DR41-5d+F#&%r#y5K#Gk)}y+9 zU%a)G8^%_z5A7fwhx3xCaE8)?zA@3Wq)z%(}-8>zq~Adsf?>heg4mS={2Ic6_RJ}0A_hVBp|IX)Jl%g{kn zgkWl85=s8RI!8m;?kp@b@r=G*mZ-9bZLrIdobU+CSBBk`&VM=}79P=Gi6DRJEG4V92!O9^yB~FzoUhYIlZ(;OWl_80 zDQ>Ou3j5`xh=PfwALq%2`TH)K2_DeA94U~?JJPvatuZ$~O@;X+;L80omwx}RO#16? 
zUIWK3^P;`b;RZbY^tqnGZK4SgghG zeQSS^+gSQTW%(U9$)kuU_+=MtsEpgN+0-RjB+iIzf0mkdy~~~yV0KeBN`~P=l)mWC zDJ+jrwP>c2>W6ORlOe&{cds4-72%3-k)aDnf2E~#^_te5kL^RiE-bQto9Z#HME{}E zb%1(NSlHK`*#llU(RZ2{*|bE~DlHJ~-PhtQC#GKI2xBz*o5qIj!)7&9r{z4Q@xnMT zi0wb#3t#VXMFu}S&k!@OwjvT(QK>5UMQXJ5GR@0NH?CH>KElkIJpb(FLn@hGs|2$c zolV`s!z(F(@@l(xa8$M0aSXk?=U+k6Aa(Wa-3n)-cLF1dXQXSC24uVQ%b(_N5~+)l z;6E^wWU)kT=7oJ6^PG7h^sev6gXrUeYn+$k%TE2(7jOwvU;2LQJ(c&8=Uc)gs9V5T zd(RXIMUA^nQ8->i7g(jf^^_=Rs}SMfA0nI{={)0q_+(oeqx}Lgu`<`X5v%zN-oaM# z(K*j6z6#_rRZdZfT3VarT;ERXmE8G4-~||aP@D51@y_dg#wm~}#|HQH0lk<4lWU@3 zTrx6gW81LEW{>At!cLL-6xHvJR9-A6qG@?&6n|@0z9^kVMpB!0rkbOm^~?(Zjg$6C z*+i(tzJ?7b>Zt^f;OUUOC!&lQUWUVrLBGJf_q3JR$VIY3{&fPmLJ#tq>_+Y#6-Krt zsXH@pw6P_Vv?Cw($TBC-7Tj8D$a9svh~cJB5OWa9Vvv>kR?MdlFZ@7%>FG?v^CVec zV$p0`1|K-ZSdm(O=&1B<+Ah&X&l{Yd9&6Ys!ocua?1d-EWGi!}VB|3uN@0(~xJ?md zBoTb9P*9vX=&j?|{)kQef>|Fc9=%*K<b{=dy_XYkd-wU*h2R!cfE6Z2zOBrXq2-@sc!}p{j7xr!Da=FPK@w=bt`@vx7%b|9)&q!@N z&G}W#F1fsN#A&zMug0MQD{=$8bK0ii(z8yx;1k2?kw||&@aq0x-hC^nRxv*nc9u4m zBIBA{Y@bW$MB{Q3A^7+eOZ?Aj=*X!W?sDv<9ZLE|=vddw9|{(kEvLze_6=n?Qc7u; z4+G^?h932(nP&fC5oh$owti=@V-Uv1Y(Wh&4C9m(k+!36(rWScWv?Eo_`jF0Do&ng zN#5k5x6u4vaE|f^~QAWr@8Gk=LQ&NcwAME*#6^9)y?r@Z`kZ zyLgBAXXoQgUaoc+9y@=EHhHIQscW5=Wv!~x*G{6nwhs-fU~1(@V~m!okMEE*99aaJq8~X!^Y1`&CFs1HN_)H z-p8TeZSMZQ{`mXy3_Ku=t8K_U5Mm^&Km#zVhg&^8vx`(@heDTcO6Cc&iY(H1o5!2Q z&=IDqY{C2`4w9V1v*Ud#%l)6>Cev1;J5Klf5{^Y$2+wv2Z*@+07CW!1uB&E96>$8N zwSIl!m}lLrH`-*G?4D)X+w&`L;z-+j4w>kiE`~^gxp6takb#>pyWxKM)WdTJ2K)6a z_>V}I$iVGJSdZ&+tH`Bx1*p^M^F6kCN=XksH zdO)V`hD7p_&P}J@Iqs^zzT8C4DBQr4qbvevT z^j^Hi?)j8G?b&-At>=;?#11g*`hZA#okbpcD`OfB+GwT}!=LZTKkA^OrBOY;>n>P* zq&SSKd|Oy!ARw+8B9dnhubbm98T&w8L7y7-c@!8!`J9s`tP8V7Fq%0&#_Cv;2>-Rpz84#{=RB?rr|L(6i81~#p-(UvlY?&~!R-Q!2o z)o?(&Xzjw3@YtHrhz@G=LyUlZe^~zu)H`keBX%ygDT@4s+BxY)HUaz3ITeHVI&}8m zp8(FiMO!r%%I%U%8xt_T+_U^i`e()Kzy9ES&*1CeFp`b%OImm$=`t!r&pO%a!F z83HcDHap(DU47WKU(sa`+V{A+ilCYIXp#L|w63Q0kMhKki>XR9fcM98+AUa!p@sv5 zE8!Gfg^k{Yu0#(hWPSvf(yqBaz3kqKyb>urC~}4&1n>WG_SR8Vz1{w(0ZJ($sf2)Z 
z!&e%%fJiq;HwZ{bHybIDklb{)beEJMuwm0JAiV+U?(ja__ndpr8RLw5-rw&Z$52r= zYprL^IX^YE<^=Y?-#>DnwW(H^TxIX*7xHMG~t;AazO6~?%k|y?9r7V8f$66iUw2Hvj5NrW=f& z1+u`0iy@(nz6&mya8Ns?pe_O zii^ozxs@QfcZ7-U5m5y7sICuM+O($17C_5inro_aH!pTZJB8gJR<#<%00p ztd(t{@<2vU0}a>ur7~>obzQXe#uLCoo)Ay+hdwbEDPccY`8pKovu(;|45Uy~jtgNa z?W(>{Q|OVB>m)0RUDZ||0tcN>s!@@g7y8CQM2<<5H|+4j4K5{&U@&M2OWX}{ck1Vo za_ylGMz-7YDEg~jRp&eOCh`3r+(R~S2C-$yA_k3*ob;QXcaJjUA7O%Q`n>yU|N3zK=y5t8i<5B-j$(S^iml zit$;a@-(X|+>}s-xd~`@4>MVHwp^tX!M70fjX>Ei{*Ic>I|Ne1y^?ne={o+{DKAP; zBN$*;$6QXW-tVE;4YwkbNugh78$OO5z^O8_EHN<`er4EqU<&A(U-Zz{I7l0y;r|lf zux=aooU$>B!;q6CpJlk%T;KVXoZz~I*5+!AOfuYK6ht-a)2Mw!VyWS>8Pb0|0D`+d zWN#lH59PC5ePQ1UT2kH3N;=2o0i%*OSax?$SdSWmK(xK&wR0%Pf$FS8Q8UIu+Ig&f z2Sa?ME0FU2B6x8-y0^Th@Adue!^LfBU?FL#wk%kUl)eDyw5QEAs$9)Jf!V4mkLv1@=Su=zyx&K?fS+8^81s{eMrXo(7PXCH$X1&uzq zkFiRoJ>I>HgV4~jVagtNhkBBy@ya|%&0{Em3IlQ+8g=)K{65cCN%gm?ti%aI(8StCel*YaVu}1RiG{bM2H6 zPWmax`cAUwZ8DbU8UH(@IjT+Mz_(OYVo)DEU8fH+!4ZcL(j@0AN@dlo!0Qdh*-N1D z1h{gqO&VniCI-K#s>X)ObYRQA;i~DO2}8Ck2rva_Gdzaz&M?^pCVm4W4G*;P1wy{^ zz!P5-ous7l;291~wY6QDeSjA0jd@&n#Wwsy#V}0SS7B0;HN7P!!F7^ytD7YcvPH!1 zk}x;Q^KXGfcO{HuC5%Fqm6J!AU1T}-faoeID>V?EMLObh%BVx-;lB?~{8#*97#mGq ztdQiZZo0n2tBBTE9DpbUY@>vnx=m02cbwzD{#XAmF!E3MM<&LXI1L$dsF#Yu%L#R| zDN@-lB(ZR#$6+R;V9i02K9Fn%G?I3E-Mv@2llPqLi1**T2>s`CT>Tr${2p?f<*`gSx0UsOx#+v_tyg?|(}8e7}nW%C#abAv9TW=lTD^rTa?Z z2SE4PQYE(Pe(u@=TTF0=zYo7LO3u=6zGa#N)i)k>x26GrRD}wx@g?O^#PO3xdzWHc zX9FO=LDVZ|dxh#xLm0q3+$?}x-^xt(egH?y@#wVctS#SSJJs!e@HPlXAzg&fd65H{ zcN!e!bTA3(25|yuH%$ttywyUx4YXGi}c9( zUDinN1%%K9s-MF~ouLlDG;l@6`!p%C8VJ9I00ZF4W{3D0vELB%w0auM9OI-%dL zN>v5mutqK@TMiSpAM;_aTMw%8nBQHCn_OELdA02-O}$?k>BE`=45B=16uk59E7Tfb za}4O0oJqj_-DGd|*aY=B*r^Se&T3g0eJ;l`2FC_@{t2cYMn@wx+Xg7xebw8xKUZx! 
zgs@hSE@1)Wet$k51AC}%ihqpwDrjoo@dCrKmeX~L@t#Brj3@%D>8%;BxspnlAKn;DU0>R ztSXjOv6?^KQ-P>TEn4|#yZLd9MO=!NHHG=-QHmRVo$`{ucx&Kc&PRC-^4JBw62TBP zXHl&3h0vZJ<@2&lxM!pl$T6z?4zDNg>~O0mf+Y_Q5wRYA;9MfpuIX63VVvTewe{Fo z4H# zkO{m-E*>BU)@-SH^<7MPHnYRnlOF+Hwe8OrP-R#@0wRH*0BttT^O>z=g zF9%`1bQv9Mf!8+Z9m%!=bnP0=PXQRAN;y4&;syNbFcSS$b=@DpB}6}FKBV#<_W9@0 z^-rUll84}O#@tHeqi#Om;=6^KMquG zu^BjaB(e8UR60v1(AZ_9fo?yb=~V#iD$4-(M!EHa@6{}s5P8MZ_N|a7NwA#@{}D*+0JUJ)o&>-zi_Q^ zi3N&dm13(akVH%V%Z!sh#E(ZJq2o-s*I-I>kmUdv>-a|_Fom7*_}&ua0il*IYZJip z9kH7TD&Ip5LCS44xYMEmd>Q$`w;ZlU4gif%HY(xDLRA*U3vWk`ltzgm)$6x<4w40_ z0d%+uU>rb|JQU@|YEKmwxYzdt-#LMSgL$vWj ziNfYjni9;%ZHX>kx1;EvXPNxy_Zc}d_k&)Fz_vECwS?&w2pi)YvaD795Ma&jL0wuS zFI!4(`VArD5@e07fZnVIYVf(VnyQJnx=H@35bU^vcNoWH_wqBxEr?9E!CN`>P7}jV zbTok9@|yC=&mALUmENIPp!99et}}W@!xh;cn7SSu1TO?2yW$lAGxC# z87f2efiB#IfdLgQ%d}X5=>&3;rg>*=a&lI8e_ns?@h;IC6mwWZwb|}_zPnBteQ&jl z(cvG`#U!OB4vq2ufSeml#{)Rzmg{%MvIoFJEkG8nk@Gm{lvyJ0XOQSSN2dhOg95;_ zW~ic`J+JK?40L&r#RhJ|c!2g$pON>;9j3cc#@>p8#S9Mbn7nU6u2A3i%FoJJ1YNm; zTy$+(7s8jnbDXA;Jz_KWt7M+Fhf+F%akx)VKr*S7Id4%WU1L?S(=F+ij&SH2#}E1@ zvqaAj6*53}2I`Ufrx@Cs_~^g$YZ|I4P{f#a0>GC&hUHkZcNE#l13$26Q*GTwWptDkAhD;QtA{auUWHJ^34NZo;1F<#^9X^CtX5O=7h^1DmL*C%f>WVKH?;b#mp+9*jOrHzU;@)%n&lpZYp8q35fOi zvEynw^+|~_^mC(X2X>{}y#c7g&iQK2CQrsixR5G1&|viQij@jIxfzo5{E&XH+J8H zzb%;_qNE($_FJX(vio;6k-_@d+fiARO9_$WTaC?$tK^`}KT4gvt@R4*4GSkQtW@6_ESmil3!?8Rmn?>V?TW4$N981U;L ze2c>MXM56KSLZ%xIW}k5;uS1OZeAaA;oV^SsA?|Zar(+ zYYbtcuOPLrF2>Skc;xzWq<{kCz1{z*1=4G(0i7#!)j`s_cqul|vt6>xYLQVlYbSK3 znAz4ILwfT(zmsf#Y{=;sjcpk3YD$hGlOe?kp4uMN!*tK|DnSrI(r2pTzDc}HaZsIc z)88oJ`^>f`I;GiI{@|C>kh#1lP4&)KD?O7VYq%FaI|_>m>qgoY91z~!sCIuScLftR zoyq82N}ic_c{-!eVv#)#p2p+gI$>W?nvVu;f@vKXTHk3MBAY%2QJ96x*eSS-Hz}5f3+2t{MP`%lbpIgt z{jTWKN(;am%`iQ=B*Y?2drnK!P51-hrI-z9AVA`pB|EHt=u}`AX@dxnU z4%|!Ah$bNj8;IoJhvPG)ea^9?1aMNkQRPmSmGH~z%EQs(+wheSi662=M$RfedjYP) zY~fkKh zwZCa0cNte<|H6FX19atXR?VQbzYmN36_ET#6`{$4ofiXyJX62C+o^-empgbSw_eXI 
zj{>|{==nL=6+EZnDLj0__uME@3M*qL#$wK&i>>oavR~LiRS=D?s5BT#2beqm#3$&hUk+V=M{wVe6))R1+NTOn*Z5=}xO`&5)Y zP^Q#PhN^kw*nF}5PDUC*@>dcw!~&b2386*U)ob?$>5O8(020j>nNk)U-!C^`k@GQx zHRG+0RUR=1tf9tVH9ACt)BMm0VBSF3K9yxd6TS_QZ@YsxyNxtsHMB6t)`Kq}%z(@$ z61x2>m2dEMAUlI>0IM|TU59=yxEadk_poy-=pj{lHjFffU$V^}V4HO?{MGoElUhF% zECXS?stzHg4~cuPT_4od;I{3VQMWx#GS2pQRZPhf+4{yX1Vo2pJMN>gt>0QhhO()B zNT{ZCv?J|D-FW<1JH=OkSDK@K7jfl>>rQwJLT)1CtBl2oT9v&COpN1OKgpC?6{Jmi z{D5F-c%CWAUKCAA*tEzcsg^O_$W0vP?URK{{9)MJP;QFlXUyiB$IDg{H}FwgI!v`5 zYsf_wTukh`Hb_5P^x52dJYn@&cjn#-j&^2 zGp&o6*eQz!ovu5B`(5UjtW0&O3sOzacZ>lAMp2`_`QsVLbwid%OBTs22Y?Ly&TMYrD;b{3i^xI20ig0;0=8SC*mw5bntM*Af zb4tzcl_ZAPgYwSx&|ebX!RJ`AOoz(^DaV@Y=aOesF5$7oJ=&pq*z|hsP21jLM4~R~ z%j~nh-G?&g>H@{15jA?!WI{evGWJHZ!^oT;#61Z>)o_5#Ea5N<?Lm=x^wUjZMu-l`G0-0h3OrYWing&LK6gA@s_Y8>(n0)W{=3h5Zd&A3SC zq~oq|Rw~I<5+$Lv`=2p!+U9%0WJGO!K2_JM(67)Et=^%J;~eNS(X~92Vs0xuaGw3L z^iYx%qp?#QZ->B~i_`cnSrFIdy)1P-)y=zfk687^jvr>1yoOLa(t4{vTw>(kM3KCW zL?hWua~e}wd@r>h+QHbl`x4guTh59>=PoQn_vUVY?ky?~>5^Y`1{1ZVi#RzWzicD& zw;Yl-BdrE0nGxuHq_?*nT#$IEQ5X#K3~p1G-7qYIal{QN3%hb)Q-Hd*Y}05$YygXO z-!tZls%!U`Xp$MocaZ5V&&%A*r(h4a{@vFnkAd*|nA@f2K5@!m6K}gmFyD6jTMB}J z8)}`6uMJ}A)>3rUxE&U&`Z9)dTG70w>iVfZ->7H4{oGh}frY@(aJCnFY13w}@`O?s zcR$6S;XswK7GMFgZGm~OF6mmul($J%bMYV=Tl%fkq-~tUrmV~e5!M(h^gO{W5%qXi zDA&ntvEdCfihR{~9QXN6`j}M%v)67HbUpzL7n(Z|)1z4KE$f0)#h-)%E4n1+vam(> zdXtFsZ}`g?SfBT>RMcZE3U{QJ`MAnro5B0eJw|EbEOD(?zC|-`C=qqWM=TC+aJi(u z^GRvTMkm0o`5#NQp7_j@1|0aHZ3Id0s?A7npkxvXCl>om2$^F@^VRoMH~Xf&JDW6d zs>q<`P4YHp(^NglK~4r2lF{%mo&CVOWwe zTDBOCZc{>AFH}|Xcz#P!Vr_X-vHrcMiAtE8q#Bm2@&+$5M~T4L4h;0WIG&1rK`$xph_XwpYofJEytc;{>kNVs7@|Ctgm=xnq`{F;7Fi z@-zR@eluVWz)xdQi5bIzAxrT`JR7{dpx=^e(<9nkBUd9i6RKyRZg6I+V5hwD7SW{9h>rH zWd-F^WKQObKo>gO)%QSUp=cc5NL=HUZN%F6JRXCs9anF!-pdQRl4`r z&HYYOnvgER|2Vj8u}7Wloe)`Lytk|~+Eq(mU%4Lr4uO(5j8b8V^^lR8cFfklp<6O3 zqp9yS4(6e1N={Utpn862Q{Hk)VBtC}_Rt&vQx;GqijdmJFn4~{;;9#5j{Eq?7)vW!o zC+lohbC>@(kc6ox@;s40zF4jnOBhOq+_w@*J_(yk_CQEvCVk-gQr0J$#rt4jme#W 
zBS>{iADI9R(f{V?d?$9Sm5y^jlj3Ah#f~uyJI#GL#Li}5*J0+7l-(9X`x|hXmPpVIv}uMSYC!z_`eX|kwzYMZ&3D2Wu4jm$ zs>Y~+?*69A#&CgQ?s0ZVlG1aER;H6*+@>UJB?8}0fedkQeG-AxCd`jGr5E>2~R6XopXkLxPS^D#&(br>OYo~0Zdhy5A15c-_(GVryr%C3<9 zYKa#nd>I?@5)aOVrO{58M+p|#>q}xP99k@?xjx{bUIDn;>_gO zUscl`86kf1!cgV)*iGgxPta60#gIlHgLh36ul}CG8OIKS@<^TqgP8629-s$2Vz#-* zJ3Fj|f_~*Y6tc7_HlWb)#}A0jj(=_e=&drvdt%Rcz8o*sNO|IPoj}DmZKeq;S!hG; zbz^|!Zj>W|-oi zEMYll{*A$(o~MkH?sA;LbrFP6%LfIFM}AR)HF4N5$B5~zH(7NMtW9ku8dkmlcLBn_ z6_E#u=9?{#FJ6=#Wh=!hI~I* z7FTk=iRbKR;O2NSx!%W2TGyvk{s{bF3w`~uTPGMW3dsX{_53QRMxRt#^;g`;65rn^ z_^mq|_dR7i4P2Ub>YjZMrA+7(?E3f$2(8~HrTko!QJL*!81iuAO7Dyz=sbJu!iJD3 zNORTc)>GstEYh?7*MXSAP;`}BU=lsX~>G1A|XBBgQS1bJvW)$(3BWO4opeVASIR+iEQ`zJOwh8;ftF+_f`1}G27LZ9hu z)iXXCC}zzj4kG$vFdl%MJ0_BQ&e~5i>ka#yib(~q;ud}ueU-8&<}X?2je;z$&ktfA&ub0-$#Y%LWfY92&vMsl^?7>&Z_&VYm@mZ% zrKi*ux~|nxj0K|hv9ReD1-Ya^YF89^n{Zm_v+f-Zp7He|{Rk+y+$TqTyOHZ8g*IO0 z%og(5CUYNYC&+?(mkGm|wOVYzA(S2+F>Qd7PieGQZ+hW`BPP8l4#{Ti@*bx&zl}~1 zJU5dFWBuxc#^s{-nrSDCvGm)`R#XE0B~NKZ=*#yR%85wE0%#`eAZ*AjcEt>`S9J!! 
zZ&z~d_;Tfjsv8SNJ2mg_x*Yli7HYlyA5Wug_*YWKGR`JU9f?OP|6x87H1Sa+{uy>S ze4BQ-@{?bI5j%8Q=r-I`x$YAwyys28_<>#AVOC(4lCQnI!lf=?$g6=0aA>Z|ipaXx z#8@y+Ne1kFdUvNJRt?_IWi+w_tb7hfcIR8IifbXJN59{>tB9KpYO?6cq`Rz4%ZTS!h8=iG{Y-)DVWwr8HB512E{++WJfX*f|aOs`IqGtVPI*O2nMF6^+Ncs*(9bOjUbE;>DdLXrKrxIhHvGC4v^5 zkBUBmB&qJC=FVRSfZ}268eF`;e6FEFnNTS?ahhPk-K&z{vbEn(J0S$CRmsQe^M?K` z+WGobpX8!oGJ`Q~3);gG=3Bm%g}AB6yAqiGJiR@>kHr|z(?zq3$Tw#98T zA(Xge>+6}~`Btwvlj(LXR(g!4$_?>_DP{l{g~wPqCRbhT-t!GOQk!CmRM_PvFWhjE zj~>$IZ9dzy##8GnMok*T<&p6n8-7i!A}bVA;5LH?~y1AuBR9H1;Bs3`dx~Ck|z~pmmap^i+ue%UQgj>(iA-EKzZz0bRydeuqVm<7LVD2!V#_*bH}qkm%{Yb!PVM1zI{+#+ z&nqydQL8=HgVJM9PVdjt6sisD)fb~efY{~c-RG3by@LHcB|)&*#RH%rwO=tLjC;)Bhs4Jj25dTgH!@^^pS;9Nq%Y89sht~uk%T?_6TIk}& zcjy>POkQ!T=mh)6tUc*tO8Xj)o~3BDpW4;=*H2IxrcdSFtRm`~`P+2&I3g;W#3fu@ znT+y=)4X)7$HY>a{raEPR?U;=7kmy~q0x-;`&_}S)U&D|0QL}Jn!^q>yB>85P_HxU z7g!Ev-=&a%>RczT2JY%KeGoM3aZr0g?;Gs-wPl0i%7f;SP>p!BmMPN@aTTi98MJ}o zH0tOHFwdxKF9)^&>@a=qO5~B&IAp`VCsTPO`0GA?Ow;g-w>?US!;tsb}jj~3~|oOyPBD~jiEvYBSyTt%=JI3 zg(y3tQb|Pz*5EU5(&`}T52DnModjzPeVmhO5?og~QZmw?u+CIqwApY*HKLy@QtVA? zh*Al$Mu+*w)TzP{z#H5za@1}^`Kl5uhcY?2FZbLc)!~g@$Xi{d5Sca*ufNT%ZZ_S( zF5HJ;AZ{`_?Qv zf|NL4+9jZ|jTzg_YI`@_kPMnBRn@h3LC4YQ%NGB#F?a(opjQg%dv#@yCXz_u|tDrZO^GwW;$_$#nFA(K$X@UR9WWjymFU*}RKrmGD8? zw&E~$ohSFZy6Jb=lnQhw!*+-KAtQezL&=PcBxQ&{Adu62o{T>XCpc66*r+BSwWy(UGGIt8_UzlWpy3zIXh(vjK1TZYNBPHIi(9eCDZ36tq}p2R z`wIk;+BmuB%;1DWbr8n#L|+_S`EUFIn)Jp31{(A(*RKG(*R;2~o6?;ZacyG1bgtJe z!JYNJ%yy>S+(~m z9&vqWS7DVL?>klQRO@Mb?17o5+s*N&?8KkRlx={|!zxSyI?wH^G4+5G;`2w;{Cc@y z1j#==)HLUlhB^T@aLpzQE=OY70@e(LKC))s6N8ZkPJoJ$i!-5zy_W3sQj=ag^O2L| zhFk)Xk=$^+wLJ6^c3;G;>1zL2intB$*q~!ePEi$hkyl zSz^);qbsd_TYVku>J$&060{kPNOH+{W%RGTbq__yw(MN?$q>aoN_~d!n-q53%x|aG z6(Z%pL(dJ?h`Cl7tcm2qHFzNqkNbbjYKn)Ntb){{P19N%|8iLS_=z)Tq|9g84K$&w z-}EH0y#?h*I>MSsSAT@_&zR}=u;wK~CO1~z;j1#{rV{3Sd?lDhPkXH5G&5ejKinA? 
z*#|B&*Iz{2G&B)yGVEEhk}F4`^n)s$*kpT+EfJWe5aiS)Vv+74Nc2Obe5HkemLs_^f7VN0?-U(RA1k`G&PGya22$a^&dnNSPmSC`rSEbbeP^UXBMX8kzZ(x740|~Lc!v}zL zi=XN;{zJ&PL(j+8a-N;19MCEnzMai7% zA9uv43d#DE|NaLQfX5yq!te%Dq)t7gafN|Opt-Egnm&0pcw6~0zzYEuIeCYufg?&RICiCB8HLE7d%r7 zl|u{BUj;v}fvsgpI3)Y!{mNFFlQ=19dv!KLm3VB9X8T)dU9SO=tIkRw-a-<`nf;o_ zEwL&tMzQWwTA^`D+QoiZ*<3L-!DU?fG)sa=!PhMNtq z@Blr_u<$BICqNG} z9#9oUqnL!wnwHLsS$fkY;;{VMAzp{P@4j2FiW#`0LOs0yaQzs0n~jI0a#O3FHw`O- z9%0*2iHNrRRRD}?J#vwSUoRW-ElHOSQ zcnyv4DvwAhe0Csw*(F>HWyf;N?(C|zTT)fWO{SVlKb{^IluxsBt-82jt)j<@-QF&g zz5J1|^1x>l?kx(kxNlcv&k9$It0@uVUh>pizYRB84H@RU`h}JdZN`+TuZeJ9^zy}R`zX680K3pb4A0T6Um79%sIsClJ1IxouP6t*ZZ6=1z$zZ&L>!b zS+Yg;W)4^@1OAG*qzCkUgUI zijXa^T{MQHGy5aMLYkz<)CgrV>Jet@$V~Gs(c9|U8~!y)pJJn$m}cXQIgnxlx~iZ9 zRjP3j=*B6(z?;F%s4aCp({q@x$*TNvo?c3PwX5*tjR7@BL~6&OU?(5FolkDE*Wyik z@O)qq+!amouKsVJBqA(m%15373FLIz@~eBPZx)~I(M{iB*zh*7ivaVO!@F zyMh&M-Dj-C6VJQXQS!VoR>C!gct|Dl92?z7!i({pm@nF*eua$h)kSE=0h`uZq^F6Z zDn#|v3Re#8-y1I&e0jJ;U+c}Z;%~oEEnh#RoXG6Rw$ZEe&A=kY;$Ph3^FWl~AO;tW zP#eY9H*YiI$yg+L5%U9a?P@&3OiA%PzSu*K`z)@Ume>z-+vONvUrN9iqMN~Hr8TLf zz~%Rlk$m$r+n#0rz%K3pWUsh^<>>uYHpGTfx9rnOO=2V${-8in^0_6!D#mrKhk}6# zAiDxLf^b<=Sb@*VLp7DCRAyPD&@Ih?fzJXOR`vA0E$q=hT>EQ3ANb^Ib8Lzojoa3X zmVE&3KdZ`$?<((gQocRm(83`Q1dnhU9f27<181$-miUjRuI)?A-Lh@VVJt@Peryq- zK1=e(7<1K%+7FrIMoGTk%-@Jt2|YlE(>&>DWWf#$7aI2LUP#W{cpLB-45Afd?QlEV-b!aS$kiETA0g_+uI=uSmB?BoQLX{dPK$;%mPuC|CT1HDYr`=S=Pbn`90eI|EV(aNcD&o&Ug%Prvg z=|C8ZVw@+Q9K<1ScdXnNO?uf}T<_oL?RBs$;NQN(wr7!;V>c|n9Qt``AUUYqjaS@@ zS}1r#3kTw~t6!TGDWoVNeY33UEXs%|B@Ygm1~_0?=^FEj=*!9XljTcx9N>rxwRx1ycUOt_?sq+*#0w=%D$JffI!#CCGt0mkin#HRACS3!v! 
zzdxbpeS>o-gva(FwDld?T=2Zu`U-NNu~-mrtj^S=0^l?&4JAw#U=Fe75>UZs%=JO?ed$%z$Zm!jGLMxW?L+$F+*nm2LgpeGRz{j%KjBC_=>j0sRNP zGQ5gvt9~d89FG{jL%4*Iba<^+-jSnZOz@yUMzLROgI4+&OUvv?dkkKjE5`){s zr%bRCjE;sV?uEx#)X%xn zWVzL^28YYZM(urz;3xMP39&zbT}$=P#dL=qH>Vap20lH$gWJEOCJI~7!eEyOv=IBX z_oss31zcNNu&^O_G62~}lqW&>tS|rwc>8XU0=tCLUvh`sj@_PRl-xs;KYgLD(YeyCPiOB)z)$GFBGeN z$o+K&3StuJl+_k7;GaGvGa8QFSiJXAism_7cIW$iK(Zt8Wan^LMt1esiXqK8=9$w` zv#ikQM!`4JE0Q#&>AfOC;a-Llk$9bdl|inZ$i}+}QW{ok%dMxZlOZP&%IDnB!ix=4 zU2UU95*l9no-+&moh(dYfbrr{NUYxv> zTjP#8!OKHnTTT2ekBud%6LeG5&3)%_tFbkzt0g#8S)7Gnq$v(JjNKz@sl=aGR%W8% z73J2XLu0pzFyr~>XyMP^pBsn=%&@lWjc_p%-Vo^F%ga$YNxfk5En`u)B;t|VezL0Q z^2lCh4>f}P{Ngo4l>r53wIk!TOhUplqSC3}QBsAymLhiu6t-(LQLsn$M6*FPKB$5s z{W87Z!Yx2`HMXu@5Jrk+)EWTd#}O_49uw`pSm}k1`ciGyM@LIKN)!pfYS>r2B9!BL zx&L`i6CTy4-uu`kuvYu!Z!!{N!1t)y%+6gq+iyzWUkQk(XylFI;0q;~1vk76HCB;l zP5kX1DSBz?3Z%G#)fwk~_a1G85uo2fp_Z`ss}Fmz(Xr{j(lzVJ{9x#3;lY5d3|z0@ zimXmaVMZ71%W0MHDW?m5UfCmc$Fj=9MRc-reTfW|5cn8bh{qdf46hV*6kpZ=qCBY* z)n1<_mbVmNS)xfgWJQ+quc_%u=(p~Q7R7YurU}OkfetJ|w|&`wFh;9$83S9ONUA`l z0k1MFDNTn!s^hoU%(r-=($v=tVBrj}&{E)&7w@5~iDHCgCuL7WH)~S~*1#5-!Ik3x z3DJCQJRFL-wIgY-=(LZXqy94XEsXRBstz)x4|@|>Ci#`uUcD9UO^QfZewWa|h;F6F zuP7@oP%l>>eoc@;u?4(SrgAC67sh#8AMZcLc$c@Tw~w*<42WO;bua^hfJ!9CH-yRX z?lX#(u1%V~cOtvL38Qa6VagYJZA3Oq-6R^WE3CvM8#1%YJ7dm)EpyAxLOiQ)s2W+I z8Z#NW?Gtca$e3Lr-*t^{$M3YpZAP;K7Ih%8sXH47?}~($V=oK);jd;&#dbY%HuLe^O|HZ2O*t)5qp8(uKpv6H*{*<8+pdH4l8* z2&OOQRLsU7yP|O~#uis7(+T^=Mz0WWyCq8t0lo7Vri?|xFpKwpl9CPIdqu|axY}1U z;&Smlhi_S>{@D+PAY~7D`Z*=SQVZ&6N2m6JT=|1Gi$YUnrV-wnUCT^X1+8$9P^2Lm zw7~4%T>~M5%e3y49S*P>t0L)MLZYR5n|^Qhy^A}2?(vVG1@P>$LmPwK8fEz#mEx?s z8y-?PD~kqqpUbvBle{Yf5XrR^aqNj^&0~jC(Tu`F@p*tgvyIKLnQu(;CmsKt#OQ|< z9VJLJOwQ%wcpyVn-uZ~69{mFE7peOw9Qs9MN3idsxBv&q9vYmLaG)^%$z=Q6xo9oe zvAEwny6pH#g8;iKOWU$$oJk)%+1~(r|43>h_ye{G$yTt{X8I$=*6nCMmri#)^c2Nh za~jy&f^+kXA)(%)0&?4-r=1{WweQUrEKNz#3tqEZxv92zr8Ql$$kY2>h?l~eJAXUc zBfz@kew{Vq5B_tRAMr4xbT+8wNV%1|jd?YUjO3q-`EhT=a^>1 
zK@u>_%{jbqeBlqJbo5>0$#9r-)9S%tst#NpU*a2+=V|QXB7FbTxLr!T{_$m-Iah$6pkQWxPi4Y!)n1Nr0ANjh{bJPrr4{nkQbv7yB ze6lRpnl|!-;ZhM#APDX=_ut`Bfd|inQBZ^T{mK$&Lini&JlN_S|UdIj|V-_+#cU#i%M z?}mMqZz1X0k@_RJnP!@4P9kdBWJi9cL*hEO4cW5UQ|Q&6a8<7}*o>;K@(AkaS|$Vd zY4v;`!lL{D$5@!~I{7`Wi?POA1J%P~xiVnSQL#9Ojhh+Ru*S>#LJ zUZOxftZI{AA>7d9l4L-)s3~Lh8@qs9SH6ruSmBw$%Dr?Gg^)+Qjy9&ugXYV|JD=2u zw5lz|ml*QBOYZNRrlHa4@iNt4y=7FJw+m|TAecl;!hJ{EKyLN3^g2J#nM}eYEp(u^ zxi2|tNL{b5$7Sv&Y+%U3?(zKEx>{Zxzrom^0`C}mRS%6~NEj^%sw5cgKOfFbSHy^8 z5*K4q-=}Gd@>f;SI^z3l1J%SZa!k&de@^`H3xXC;Cy|k=>5s(Ylfx6=M`qJNsvVQ% zj(Gg+ia$lq^i!W1>!#zQ)l<<#q&ef11cP>$ykID77{*Ikm^1Ht(P#M1Mm>q(z$xf< zVZrGAUh*FOFP9$eC*JwUkuXmxgnRzDUsoZOpB=)53_b5s&W!1l_TC(<8p`@XI~^yX zN?MyA@cN%6zh5S3bqWSkZLS=)E4KYGewPpY3|JTy#`vU3 zX?=q4#I9~{Dp0#V?O`v6ofE^s7tGMwPiSbGUx%Zdu7B{rYB*T6{ig{?PA>*2du%1A zrUh^&37U?pe+HQpyt5!BmZ!uRZ2g+{s3U&{G845*|6vZkBPE2&Bbi3Uy|6GC``F0R zTkom8+tF=NN;(l8m$P*g8MkhLAS@h-%O3uq3lF;hRO>4jv=U#dE&(ZE2Bu$Le(TN7vjtK-2~w1oxg$=u)-qA+pTExBzjkatezfzS zyiY%WT;j%ia>VrJJ2Hr%JHVRSn}l5>u5+U2lSV^ z6u9JXaljY&rFLVG zy2CWW8u8a4rY{BdXo}jeBXB36(60&QTlY-JNbdz;0PEe2iZ*c8-X12jnR%0dv4Hil zeUkEPt_@J{5J1g5PenAsNusfJg9@2J39a6N*q6j6-u_l|K54wK%oYK;iC-$M+<70= znvYAOecE)hA)&6${r~d6GT)$qrSe(ZUvna4We<88GMo=ST{d`sP2eazWQ+Ox>9G6? 
z>5~%oLg*FChw9Ww6UY9i5XsE@_dq0~Q#Zc3y!2DD|J|mK!;a=zT3|{HdyvpfTK|_u z{O`{FV>CZtrhK_#VFoVY|M2(!{SP~7qM%xqHj2da|I>&2uNM9PpX9TAen93}g%&%q zRQ&zH|JU#O-#^3O-wl)BbM4f1(n=V#0vR-j{N(pV+hASp!0l;$TU>kOC28Fb6L(h%8X_VJW$hQS>9csFQLq3(^m-jOPV_$Rz*oGhMp8_>MbrT4$dT^d1<%B`* zJS=9%>V`7s0EFb#6px)?TNz1DpV8NZAr`x-4^1dPdXs@qyx12E@6+mx0q}kEo~qeo zs83v@LbkR6l3|w-BJvWA0RJmZhIjB!{Q8jZy8rwSabChGR_uOm#H^C<f0v&QIOheo^VPVDXbcrVTUH$p7+c!kN4I%en{XAd^D`~KR!{3j*B zt6e}8b2$lI{E|rQ7u;*KJKy|8Xg&!%`Xcyl&Xx)NmR{z@lG?lj;g!8W2a_1UoQPqg z`|m>Yuyd}yJ47g66Q4)XGS(6E0V00q>S?$4THuuL2To}sUAEtGp8!v#pJrg7*uz2J z2sFu)pdKqDy;^w4u=ve0BT_B5t*EyTFbOzcsHpPW0YR`CIP^&{NYMkSPZ2>?WdJKf zhEnE|B`rLX3V3UH57LXU+?(WDiw{dlo<#%XynY^~LfA}5UAF6Euh#;9vk>GE(pQ`j z?p9r)hmIhlW|l(6XQ6@@&kzaA31~xSMOiKAA;+l4NKtT_OPQ5rkalU?{#5jL(6n%Jfn^FZoE9iRyb*8RYxe=4+H z!=`W4y6BUVs$s+yXqSF!^sH}0^-vo5Ix5?0G4fkQhB-1gxgn2UcK{x zE_rs2NS(A-@Bbnn7yfvA>Uz6|(^#D8W38!)M>dYd@tI*mTS&KW;NMR|cLfA> z9k+(DUB3!L;l#cQ;qG?s!y0J@&WlP$Yp56z(yg$YrW!^ab6tu4UQ^AJM%Na@y-`&r zE_R1?%8SUVmp|!|#^#py;Xrd;20V#g2UpKIZdPpR4Uis{=tIVMl)@&y-i1v&VmYuS zkUFr7d6qa|(~o;%4p6O{zgis~upW~j?pZiE8kXjfm{JjF&LxQaRO}p}n2s&YJ2<<^ zlKh0CTMQ9hQVJT^F>_lYgt_qdH>&pkDiQw*BP)sbTAVuw(e7D;<=6an>QQL%&Afc(ybN zkd`=~Nj-lC>r>#;5tkx~q|4>I>yl4wwStT1MX^nLuDdN79)`dhw?N%wciuG*G~qA% z)e31=hY6d&$vz4E`12cTmqcPD9|@-c-0-8@Dy;+KIB)`_NYetSx+8>$S?>#P_YT{U8$I468V zLi-|=rz`>8*4E=1$n-7iEU6`7^ZHbJZA z9P=bRS=ZaHN+qA{xg}_CDGcFEJ#OuQ=(^}d4Ip&VPIXTGlu`9Z9rSf+(aspnC}E!( z>rfa&8SA_!W?QAWEe)H!yD*puKUC*dhqNPXV`P;io#p3%B_fkP2R`;|fGI)V1UR~V z2c`vU))_ml;0DIEE*=yis_7!?br1kE5yG%w$qx*=6hwNN1YNWo7na%Yxr3kk{OTOd5|MdW;MfqSqY462`zjMUgo z8Bv3D*I9$u;3zc&tyM)M7av<|}P zU|(vSmwrmi7pY*S#T(|l*ZQihSSUtb3Ex}=sqOW4^@{ZIW8Qg zTb9bPTTg?CW@|?}yY6|CRwrBpp#MW-J!QBTUaX;iIS_s^e15)TyFH7puGFs*vUJ^D zee@5F^;$y79~$djlz(chog(*{A_FGY`w0%gk!QC&3!dCyL!Ghm4Kq27HI=hcLu8x$ z8usP$VXKNdvxbgtF#lJ)#BWzS(&Zm)()}@-MSCEmR&tK{@P?Bt1KVn(4-`E^SoD>A zY=K!H*_E^$N0Y3s(EO{)sV|=6SrI;jukp-VEvtl+AKBgY|6b2=E;YWaO~ccH{eiWQ ztzWI!wf?hT{rg&tP2#2zE~!FVXo+b-zt?r)XVINVEc;J~{B;Tsw}9;cbL9gsgWi+X-o 
z5McbWktzC9fjCfJYh?Y8@*2Vxyy?SaljH}TwbrbhXOO2t)&KXdK#&JvT0f*_TCggq zdg%o%i4T5HwaHksf^};JAF#3gT!PW!Q>t|F5@S7?ptXYrJGHchN?IrMp1{07`m0t0 zT$G0|Y2{?dr19ijLeUlWMSb7XK@{6oO|>u85Eh$a2j{rFDqF%e3z=~i);Ee)Z+^6P zAbS-_yKKl?V8xWStHpH_vWv1H(cgU4@DJ63Fyqmy*5e2+shzManoTPp+4Vu+n;a>@*# zt2N%Y8V5@8*%*S3kuwl6Hz>zW65{)z_k8$cw4{E?`>-QldIH!VPSG@k7(tJ>l7N2_ zgY1!wgP_2>=MbVl+?&A814fXAd!nxX&OeJv&nVuJfXx|e@i0rPb;)7$;u3Tz2_+LR z>g*_`FG@j;*v;V9wNlKh4yIr;txdl$Cq?eMJ-*cs;-b0)g%Q&a)S~SbulI^8YAis( zF#>p$e~~LkfBE<(&gp z8RFJ+ro3b$yz{d0tuwfZ9ISjlT;$nYrCtVew|6P7&iW%aqi=sq^r?LM z_d@2$SLEphdcH|JOq1VLA7<=LGS2V9PwUQ?L2wd_o!xMXp?5P^fuR)Vx+kT1OP zsrZjFi<-izrV)3~zcDA2e~J{Q?CzryV;F^B*Ig3Z+We%opBV3RaeEmsRe*w2U-)z= z>s?wrKZGMKw85wP%0QOLlwW1mty4LiqxgvYT%0NeUBxzBoS5pa?DzF|TT)rX7~<%A z_fmeQ55PLGzGl)^7u)CAK^wq%E%nA7ndYoxDZG+E5gDJV1laGx-cNR-W-xa%o^2k) z)LlL*c2fNfqLOi-J-B6isqtTM;{~qQr3?F2ER8sFXb#>w=~cycGGQCj|CT=m20BWO zhhdVkq0K_*Uo^g!VjYm}&$fuUs!$Gxhb9FT&2C$!kba+1iFy2{;f(rMvnuBxwhVo4 z0~i2E(RhooPR)ZWV`}h@DD#=+tZu#^ffC?686 zWJkS?ehanU1>T-^hETF#-!t27B`W0{o_y{>^yGMvCi|m7QK)HtiFc3sP3{OO3ORk| zqnDqKR7bGut&bTjOR_TyJYGmc44XL)o+d>%i`Gz-FG;l!dx`1qN9#cc7(gG2KSITWp{MHNzy?^Rd-WL%MSrk zig2}>EUeqv2u<60+hM7bBmUmjW?Z(B0F+85w}f#?;i?Gze{*IhVTX3Iq%Rin?ijeo zU!_@m;q*XC6~Ggwbcmb^qTqKSUe<4im)f`3E48N|o$`+3#eF3rhlVP5zebim$hmkv zTpyWpntMR#2^~EVP2vJd}?t@BB+@z=T{%1oUj~a@ed0CpFJklp!{YiWB?(tDgs)o`0&dK z+mR!u49%Doo{(2+k7e@Rmy0;CUk)Fgmh1l-{r%SPsb6tIHbj7)5>>Yb$4iO5WKV0c zF43+Xk=8-KE?!ML&fVR?dw)#YSb48Allo8PFv`HJADw=sMMhF=p~}dT=3qGRws1~u z?bYz@EqnFQ@3qQHcAv+(URBBpK7XpLNakn$1|MpDfe0Ol9?{X&*bTzfQPF|kjYW3A zq2d(E{;E=M!1BPKfc_n7i6sN}q2$tUU*ZxWp_`khY`#TBUzVvU_E-5ouVz!c3`z?N z-UPjhQoZLmZ8cDgVJk=)8S{1qJci;Od*v>q+EH53C${?>sh(|&cj409%G8tHD<#OZ zTRJzyEt>%MFfjEq1wzXH{f45(Va zq|p2E1b^2Shr1iZFKO{;6w|xGUf@vhV0P#}65|<8HFXcF!j&8gIY78wQm^#QbvrQ%xe;x_Kr*BdUSX=QCQ3 z$YMqqiz=>64RB=4>2J7$Yv$;j-YH3Km#pbXVrwWO?Vi2sMdWujCLk6Rtx2h;`n=>z zU28{G*I~>-Tyq!fi%{j>3bTgcTe9W7;3b{+Z^ZDDWU?dE(EmG4M<>@ zYz%P|ZTBYeHnY`IE8dvgG^Vdi7IB<+DhUjwM2315$=ZHXQ>k_4a%-*p 
zK0;zxWVi_=hV=emTXgVf&H>)8Rlf0`>Ris+z$9r%{{Eb|%(GJmX=C&T1Ut<*Rp9O9 z9jw*&^wB~A9J1N?!pv)`!`fg^z%;?02>jC)!zVYbm$MWl37%+pDJ`Hy$bZe5bIQD( zy?!b#8xzeM%7xe7Jf#L9a{pq`>wQpWCaEhw57L(7u}aPC!}f&7ZO^;xSgp=8S9rv~ zmY~|06KgsF2It!R$pP=Gn&|l%UPRy*2sEv@RfFsh3&MR8dd805#+47}Wz1~lU4fnU5mV=-LblNxS3m~+_QE1c7tY%CqnRes%I`OT z4J}G;l3R@1GHY$M$0M|`|J1!og3Z2Og3!IrnC7O3J|1GDwpfG#G5L>2_|*Xr|=R*%OnR49?eJdX$QeZoOu^i9HYC`pCf zz^5kUd6-y|ra>2#NI%7Ov?(s{rhV`a(-?%vqbqOPK2#JpmzGKKfQqrZy*}%OK^Xav zbMDadtMXcywog6VV3JihI2wj;f1lDVf1uw!LLGe#H@1!kq?JF&?E#C!=<1obN6Vcq z*#D3Ndc6JZIHBvaI(OsrRy{C+x%A88rT6Z;@Yv*16z}bWVH?ZT_}0?{MPU-)glF-$fqncy$DvqyIwTS0*lZiVcD>&ycOjK`1xs_{o;7h zD*4L@^=LKd1XfYmS$rA%9Mn>1420s#L7tA}XCAb>KU%G+=}h7EZ-HB^Dt${*+i%-Q z_u`-JQgV+ZL{-Y33Bc>?m<~HIRUobxU*vGYhD#S#1%eBJxl}~INsZ&B4>V6aY&|HBY(~oqV zdxMW+*zt$bRDD0knC8R8lH;dA2;Y1Ch<^xB)q@-jo+zFU;{QW^8g*|^Ky>m%OR=BR z7HRVVv1YH6AYPV=!jBCtKzbJq4I<9#=7aU>MV*A{>u~P$WvOr-E=ScAc|b=`39cgk z@G^MJ+oLnb7!Vn#;$R$QF$q<{4nu|oy~qsmhh*^KWGGg-{-Mdv#8v75$@X_1;81aO z=a52x%;IqN51GZ5i(*X>iD=|19ZsRyEur5uPEyHscGs!+BmApk5egzL8LXoKV?Uzg6(B_}np) z-UL$1_<+C6^$bH?R&gXfUZ(tcc)5O&*@N$Xl(1gndw8>mc9Vm@Mf((XF#9Y{l%vHT zYYM0yFv~5pIT90A!b(}7EtozY_}8d@CM=}P&S!g1dZefHptH`l&Gmj7o8@^&b=)L2 ze`C#z0fUJ+q3%SYI5y+pf7n_xcP<&>j7cnir5*V zjKB9h>RZ&^Z?ulBV@{ZJ!Z$SbwkFH)Z!w&>;yI^M*K&g%Fp}fB$F6)e^o9WqN`Sya zn>$bDO0Qe|UUcrrRES>6U(fGI{~$jIXxnOUpf?>x{Rkw4#mI!rfv# z`86Uc*E#tMy^?B4uk2?W37~X~=qTN<(8%frMjA12ZJ+ilh{6df(3U#S#Sv3qI@r|~ zNfh#QeW2#oO9YMHTVwV%XqNMpm1fv(1O_8y(06GEJohBy9%d}9CRTzDbAYGrV2c0J zLzZr2%^;1czIn|qFf{Is#=R0C(&l$cjasyyW#evriKq&`E)(*gE!eDE|+u&|imE6Ex*Is9(81>O^A0j@!13LhefKlCW*Z#ZZ_tXo8l1MHvqt&p1RKg#XR+;ukwl%{XdyT)&-u`#3CCjhl^CkWa5;5)+ zTbzTiy^B~fC^wwY4wpBtv16ggtYQ3R(uW1r2Zy%0LZz4orcol@WwW-^47YPFw3LzI ze%azq5b-jLTkaL%Q_Pa9h=@%9!ZS<9mN*N@cb`!zBxbgErPNQK1A1o3a;GgX%uz|H zUud1J4Qj??a?r4}>~YYFLYKc#|HFq_{p-;$?4>>Px0j=$QDRO2k1Rd!Vz%C6(QW>M zAt99#vnw4#^g?FCzN+xxJU}l-1t+{bB2|I^`bdB0oZs;9j^ArD2v(1~iXi`T0IRh` z`Bbj11Ta%K7=GPkKnR17+%`ik7-}@6(Bw 
zc1|-qzy7Sc^@Vc-75Ql)Bz|m1;;GZz*IurV8cC6O_c!kMI-@#%KVe{Tjj>zyV?bBP^Bn z2>uvfIjSo^obc8Szof!&bXQraT%ee6bZ^?ae{zUA>sdDvoS2X9#c<&NEh?2w?n)m~ z94=5J50D1|=obPK&W4!=Pdg3Dk{rHyWpHoI4&)J?=GdEP>1`%dBUEgafUV4}!7*h- zo|R0|9VHoyuX+xgAcr-fw}d@VNvEQ>{hTPm^c3IeN~HuE_Ft!Tl>E~Rpq&U^hp&PJ z?LFiNxp2Q;S&knx;#5WQ&lO4N?<(-6BNTZV{j0ETkRDB{R;3SFq7cyhNzRC#*JMP>cj_!k-Iy!>||%_m#2V_C=mFE8h(3c^AEQap2jIY|8He7+=l07YW)ip zIj4T@Ui*aI%NXJ`DipM2b8$i}!N1hiu!LU*;i1b)!hucr`~ig@4&hsr>YU|S?(Qho zd}40R0^)@3EOG8GUzQ!<8jT%5CwdGky@MApLn;01QL9{?^RiEA?!5tm^KkOD1+SV| zjD7R2OG%v5NAeGoQtJJ#%C9mir*p3@FhBi5Fm(P;BQHpOe}juUN1hQ)K$J@eNEut1 z#Xrc%hvgae$RXp$aOS2-(ip$Wv0D!0_=teS-~o^PKMDJH8#0$zu3{|M^LM0x5;M_ zUOorN6!!`%uJn&@rMAzQjCmh;#_1~?#J5%7l@Q;~4m$aHKM|}d9%95YF`d0BcHER7 zREe}?=^y_8C^e*4EUOXuT2cl%XJuNJH-m%GlV#$&!I_yX{$m<^kqx@wkcDiu##DNc zV6b@gt@0$+>BLvxpP{mofwrrTs`qazNggoNtaL@wB#ZrMYy`ozIEfH8)f8ddfT>b5ZTDOAW~%%Wfge8HwkM@y;{HKUGqx`` z%X}>OoW?%$07wn~(Dvc{|I+s9CoKHG*Y+KPsUdgK&oixNr8fDYeGtls5Fz;h5faV8 z8$zDIC}-k}>zinF*X zi}$Zjzh8O+x9?@rnxf~^i@r#;Kf$aNB4ziXD>$uq-eHXdpEPfRe_tyUjq_mcw54@M znMS+L`RMA4QmqMo);iRAKu;D;&lwKPDB1gjbm4mE*mOUMW5&?Lze&$BPDZa{NBeo$ z4>B*p<1J>CBQ~inXD!0#huHpa`i%RJ!0#anGDFy}8Z*8C&s)|o^2l06Sg9$7Dbh<+ zmf7wq`C3KGm{+<&{JyOZxx_JJ&ji=(qijVj+ZH^?9*DB`0_S#aqN=aW{YEd%$5gJi z)R>U|sbaI0jDN+l`2KQQ#}n;L~m&-w&x1DUDxL<5m=J)2siU)TnqrK>!y zOoj;GeB^~O|4sq}Yn|Pv$pMFk$n~|}`mC+hew(D>3e&mGl97|p zTBG@vExFj6%lX?v_c$(_OwuU<-AYPB`O>D?ai$@rrYmdpKG@F8BqCg5@Ceu{;02Di zf7QBZ1kr+xCfBbQ5h&tdFMxSONm~&KA0KMIvDChsyE!_s=JrH1e68mu+Q<+bDNB880?RbX&Y@ zUcU7V;94@u!WJy&pjHc);Uank4l?ud98Puean^5@^YbYU%k z)|XIL*CaRKG__p5y>aMJxWXE`r6>YaBn7Jv9chj;xutY<7wM*k-+0E71BnH%6ntls^BruTogVN0Fiw!u{ z6b_UpR=C^>Q|Ee+BVzB4Zh0HFo_LO0Z;RCI_)MM9j~|Ejyd1wuAFqICR`kBB1=Tpi zh90f<14>l0!GUOmD|woxT$FIdw>LR60!<1_Yb$z1W!9SGgJSPF<+k8z6=FrK(~Nz^ zL*uHHP0F8>G(HasWA=b6>H7A^kyC_LgQKBy>2BLAL4)sZcLxX15K`ChYo*EPX0z^$ za_GPs=%!~u7wGDDq8ek7At)>R{A`$(Y!+WC+^(~)h@WylL5_Y^QnEdOJj4dd9>9z zCR6h5&(aJaSG~1^W`si7~5%}89gX{RcN+4NH 
zG)r1T7BA*iSU~N2+EskEjM@Mqj(%vU@|(}XpULDqoQsHdoKkqNNFpRF;~F(N3|N1*D3$9s+-T{G25 zv*{;#UB}PH&8YHw51$@&j1;JIoB}e+8UBE+MR`yy>Z2rvSY;yNrfiUyCwh(g9T5p| ze3_#D@{&&d`aph7ijen6wbCp`AZZbf@cWFG_yggx!u2^u*F=Qkm-b1 zg~*uKe43&~uKyC)|8IieCV{g}@gFXp57C2S$FzqzKF#sQJ{28GJS`3?Up~IBpSGfT z^}5$0WkE6%HV)f?bx#T)TS^j;&Lr&>V~UkVl!W392RfKeCRhT zJsNSsP4`h?X-v#1p>0&Lw_Y4x^u-k;)kWS-yf9UOVDL7gpi}rwllfHnev{8RVB47a zr|L$R$g)ZpOAAE(J~g;lro7&FyKbU&mim3Gw@8}(jwe(Pv9k#I>~K1De-K-J*yNJy zw+2ortxDq=68K;p@7<1GT*tFuA)?SKIrX#6JT@E~;wk&!YL_Wl*Ni_RBTS=DE=C$s z=BGF8zR_N|`0!6QCH*YF?$0B>jBjV%hH0*^S#UXL-@AJsL-5W&K0J#OOzTZfmrrk^ zc&v@3ZW*c-z1bEsM3gu2Q@Zs^U)5C)Z#6#S^Ztc&!4$26SrbY@)@scd!{;Zuvl-Qp zOaCn#h#}<+_4PwO?VmDOQC_I{BCh8}mCaGN+*;SL!iF!UF1jl4yVn_Ieub-BpLfpH zoB%h`#T#pLsJA~U6G8Ve8s56x1mUVM5l079DY-qj+NY%#rB(j4SGw;eEpa3>%u!~E zjHe|wQ?);9dlLbLu@{Ge&> zMeSc3Oen`ug|sLhPm=0T*vnEK5oVWGH|8;D5y#CQ`OVAnr}mMtb7P~gw$FzvO(dEK z5?G-@R4%e{c=74 z#A{T?mN=$0q&sz3xNCTgd**?RnhN!7toq#NrHKbW9#tl+3qPr!KFkVfDu^2VefuHY zdTx26hl8olq3Qxh!AMxOnd(EVL70P&SkORxsAHxJJky+M`#SN6dsy3YGo#)Q$-cm; z0pkkLyn{TfBly_r`-kU5a|N!nI@fUQ3dL*swCGZdAci_U4ouQo_?zwRt+-)7r_)m=j0YX=6D2anZWv zOi)H8TuSL$tgy^lW89?dCoBzCDiL}u>6BntrMd;rZ0O&rr5gzqwzthn4s${Zt=pZe1R2x6Jm|dEw zvGJc}8+6RjNVQUv~8(FT46JuoFOYt|UcPSwT(oovvgap!4y+5tksH%)P9BMJ~w& z#xGAAREC+_M;^@9M#yv6X>P>!-H>eoKEl9z!vv?iSc!0C$(*iZ36TaBnoJ1!mr99g z5+!EyFPjv`MH-B{mtq~GUa_@m;l>kQZ-*RBOkJ*53s|1e)OjJf9HH_e`>fES+?Ue{ z@p!hC5fqQR6;48dqbwSFh+oz4i^jZ(A4N)5*`m!^JKO)2=CmVorglCer0R8U5&F19 zU_v)@+Q=b>Qd~RhOk?!E+(6yVOJwHk&%1N859^IJe!#omN9N(!%-NW$ldeTA*XKDY zU(z`nI_7WhP)S(j_pL#>c2C7Mp&4_cZHfNY8ql^%+$t@(0g8G3Ro>k;tt8ksA#{e| zm*NH-5~Wx&E)jdu4bwyfYw<7pAT<}D>Kls74%(R)-f=RN9y%>&Ia4~gg-4v7Q+6Fv z_SN*vW%6r}uh#aL2G>lii)%r}`kgfEItNN=*3r%Kr`z>3hx4<{IJ$)j@8pQzzoOvc zzvdSwg;f>4(6}zqXuPFM=en^rPkSQKO_hC{w1IxVK%&KZR6FqaH}t(>mRzN#OO)c$ zh2*A2FHDm(k;ynXc>8}(L=3@XXQT+f;jW5~Kz>1*ad%eIyY~-Jc|oSmj;O%Jx7)nB z=X{PU+BJTWOL8}1B&VqrR|O*<0@2nTjeFNpeoiV^aiOrw%NEPMj!i*RMVdSXzNov?l&f?3TGl|f@|zO%35XZLN^M&HReAjNz=Ch86bI~&XU 
zcy-d%ICO1}k6wf*xUEUZF^`G<^WX+A(+Xt-<~EcY7#5KxbjMu|`N31--(xEJXjz5V zkXzjEt4(+TxOfYWG-e7puWN0$HyEoIkT^_6Aa-E*AljEZO#n{YN+xkDq>ZT(m&=nv zyWKfi%h1wbMVm!GKSD{?z)T6>;Y((t$UgFIN6d$vADK!%`rz{P2Kohz1(OKq$wty4|GS+ zxr}>M`pmoDHq>q8=9z`+-gQrT>+>-`#X{xb6M_)tqA9BC`Mm9b+ckTx-BE`*qDxsD zO^y>*4S}HGP5K+&1+3j>jr)jSyC$*jJ@S|G@D5T2DjP!sipih%+>3_qezx6K}_zW#*M&_!r%t)bNEhP|6#u@`M>I%6 zxC7!Q!%Dzg?*>u@Iez&!zv;zK%lnUHOgPv4_=Nvl8SX4V9yAG+B1QS1V)=i|=70H2 zLBx%5j&>PXO8@oD`EEB-^-?jHu zkBGYSFc7?|bgXJX56}ps`Y;0+6Ei@UB{wRoPK0QKEdY3I)f35(S3vd5ykWK$W&^6b zO!Zh64 zzxz2x4(BpI5Fx6!B7U1+y^graBPIHxRp5`faOpYbD*@Wcnm+dF{Or76fTL~=EHHw{ zW(~qV%mC@brdp-hIzokFbOToIOtTm2ENF!B`%p#U2VqIoN(E}?&`6}Ze4{#P{_|G8 z>!V@zJSm?zi`tw4`jLFqG7*wwHsc|REFF51H3VfnQz}6%Ylki9Kfb$y`WQx21W!Hm z5$_De&y=?h?SK~SasX%!pNNw@9`QM}91^2GkGk%5Lr`tFLm|>-$3Mz=MTHo|?sM*n zmHTPifu87?u1KS1h~^PxVLs^5EkNDFw2^j4iw*@og&52NJySGheERAJW4`%xWubHu zzm#ULA-;Lunj%5G!E}Yi?9}o0^tixqX_bDh?fs7fiQ9nU%%gn{*zhtufiiUec}*)) z2|mc6mGO2aiE*1x!=qN$tx4wXzQAhx#7AAOZ2TI|wC3$!yc@ z^`|65v1vfxsw&j1vz4064)2o5a5~+Qidlq2VA><*yZv!jxPt68GD>D&7oriT0o-m}~k=3{e0KRmHq@^GkCL*5NgkL+mgnS<(n2kcK9R%rhbp?SC>newD{4}%K_ z=f>DaV5bHHA@>ynLd+I3oE%`*9?$EmHSCFWh!Wb4cihM+bU6Ig=#&EdpK7HN zxNCsJU+;Jdqal#1kl2B9cYJ~p#%hBgyd(POf|B-U79bPdqC1?Xgz)vn@#fmfKEnJJ zq$|#t14lb4)i)2&pKpOM6Wb}{LU2z?slEd|g3y4q<-SED14K0 z`31IQdQE@xMoqA|YhuK5{pR_2g{gAV_8NE*pAliDuI%1xxs3r;CZpZEek`~Vaz_p6 zr3PK00Vd(h>+4xb{n7YR^#HMu8n}z=LXCInk`Ao`?>Wwd0?}P@&Dzga-LEp%;u90N z+4(w9wwJ+JX1PgXgfGE7ZSzH8=Pe>H;MQWuE_&)5csb4jMb^?BSlZPeQXNnj<#nQv zzK3@i0RhM!Y<=ziwPdONzTT}1zWJUjdADswl6twTiTwO9uqz6@ofMw0@bU{1qq3Sk zz@3U*{pUsTHwRA$&CeAd?X9BTc*U5UMwd%&P+c!uxxUtlX-LMskI_l}3e+k|+i#d4 z$9jXcF(u|og&OOH^#|P(Q69kCuBi^B$eWm@nIFqp0|<+e(X*`$UwaED{#%}8BJZRo zh$$Rgx&_i}7;xZJWlQ+90!|;x0p3H&MnylhO#Gi4zeMi}5m{902I`OOUn2rW&qEQy z2mY^y;Gn+|J_O;hGHIGIn}o)~?o516C*@rMDYf^w5uVRl6w@_5D*HaZ84^(!7T&S1 zP4O7yiYFdZ$76iR;%bx|csq3XeA{fS6GWk|7gjH!2q=$_14J#MqTQ7$tP+5hGFkYG zv>P933L!Sbz*1!mzv~u&;6X|fQ(8HRV!KUpg;Bp>UXm&i@D?-f=7JY&Q9u@_=1Uj+`ox`N87?xLc# 
zTabXPI#TIa7*key0THa$5O$YN;IZphP4KYP22ab#0aPHG!-Jk=)oLLjmx}Bj?u@l~mgv?vW zx_u4hi9c~H*$#UerUA8r*^sv7P$Wei)oTN(xa`k>L?%Zn)rA0LYRY;4scAi6WfVkv z3jh+_Kx&fppDCX6@8)I2Ji^Mq)k4cs=>^+xkl(6WhK_M8>s2z8_cPXQg<{52;2Vz0 zdsAKZe?=9PYq*#jWHBU{D=uto$%mxr*s7XWj9$?j@(i8EPYsF^VkzQmr7vU&32%c( z!J@mqhIPq^>dHR>ijh=7meCyG`cEi$X=3iZxn`n#tS*~?^IvMY_e{km7au0E?%vN8 z6J*jG;NHyt9pVZUxiw%?-YiX57i?HC_@sv{Z6A@S4vqO)-3YP}z8@0nz>O#eT z4B64rAyA7Cbf#h#9J!KjP)TXl()0`fPX}U#F;rvZ3YEuYL^O1x>0)7K-v%U83#Ueq z16k+|p~!=%IBVB!#=a&)@)v)MoU37R&EQ1r_6CG8!uk! zV4R4(fh8&YOe5* zh1(1DY}h;)Ku;tW0_o^bqKrsEJ0OF=%?j!78lk%yW zo)|r8b%QF%<8RLCf3_X#$TxARq! z;?Q|7+S_?`ExU0nw_-7-QZTX^aGCwUIF(Rr;1ck5Fh_5JuI1hGGg*v2P<(|k z7x7DeiE+bY(uy~UYVMV>8pw5y*a1V?!KLb*59YelmFCxnoILg-v;M27GbP*pBpY>Y zen=rLI;3$$yYXAQ)aseK1o7$XuU?IKl(1SRTG?T{BoDk zTUWDEb(|rV)HV&lQsP41ngMQtH)+Y3F3rc_bvxxTZg5MnmIuO4@d{)HL`IG!H1M8) z#7^gz&#N}Od07Bgx>|U!{uOE~pw!~_Rbe*pUW3WOm4QacH<)1>Zg|C8%J?iv99o15 zn$-I?+|gNA^SnJmwM86YpiviNXrWpnYdsdBl?_cd$3>NKh1yNba>fj!|V!?Jcg5U-^UQivj%Cw5r~PGh`x^_BaKmojBJ#J zQV>Ys&Q|A)i~2}l+YtNFsq)SLqA&mFv*an{-%ABkMR$To-O29bB;zPISZo=ENdoyifBNG*!I%Lp)dnQ%QyRU)Le0J=<#(UGnzjoJIFu}KYK zD7_E`A!+bDL4m;y2jiZvGLQS5eKM&B!=Mo}^Or|waO*_E z3{%3^9O%3El=H=jQ}@`Q%xu~(I5Z5F?c zj$8Z63>SZHpsgA6^6s7oF*(&~46kzT;=M|bI<2Y2EIpAh-UY4l(dh40r-4MiYy0^| zmjV;~kE*3HV3-QS;h|r>ktebm7n!NoJI#td7H{aU4)&AnZ<3nf3W|ePs0iv8!mRo7 z%r0vQ)H1m$3Up%9g?}(A`So$8CsUz5Q_$%Hv0N64ID85ZDgmcWc++lui1HbeVJp(t z{J4R1nCeM`Ue)GcnkY_u`fdm^D&EVJUyPwZZ_VhI~ttr|Dj|Tj6e-}lhe&~=P9YX-u+mPPtWUO z4Fd;-dOXq@wbZICw#SuEEr74OTB21uA)FO<&jz>vobBiwu_(qwl+*V~%%xw-_FgRR z`)vAXZFn#!)BI|)oTx~wZGT+%v&6BWp*-eQe&UvHPDhlccH=sjG6MT5JVieHy#*uQ z`S(*NJ9WGKc3^mPnNN%mH6u(_%G}(A{qfG}^{lqRw4|`m`B1#!_Fz;~z2}byY_i$A zQmFjK4up%#_M35ld0bb66$!)Rq>P%v8jZTl<&r`%|E`G)N6^_(SAiz~oOV=k8uVQ8 zc^4jsEn5yJ^-R^W?zlJv+4SPJ8puja#wE4Z6>bp2BtUVEG8w4?5p-kG_#EWQG8!|r zv02b^zeWMvko1&NSsDxzfQK*;{CsBXr5x0&2#20M{Fk!#FUR*CcUs&Ro^lb1_D_l2 zDUcRXYc1=lqke91w{-(i#mBT9Z)_?_OMp4-&VsWF%kYg%NN8=1FP*N4k9hs(z6h@t 
zq;kK<@n3m@EVa(`91)2F_`=I4rJ9-z5oPIof3_oq7ETJ`Vu%1Q-^Zbq91aHt^WL>u z+`!W)4SzRMGw0`13Uel30PHsjb;uU38W)vOY%34Ddzq^u!@8>z1^Q# zxl`gr%V#HwGb-OnL9U`eJCH5@76ERRX%U;J$N6)*J|tJx$U*!-Bg8dckcWKYlA@C^ z#fb8;3=2QLyo)MsVo8mN_WcnsyRbuWsYW6)Ji<|6$^;*o6*!XqdvPgq`H6C9mq;C8 z(0rIGz>-C?%3{`J>jUBjt}ph_0A0VxW&)tr4ch0BmwoJ`RcWhtI824*t0x$VFa%`B zksCUoNaQIMq~a%bv&fmma9Zgpc-7YQFmAASu*hfAC0j?9cPRHC%KWYe_C}Mkcv00m zqoIyCCep`%bm8isi$L41tC2{b131VCUP9hh9@Sb_xznT*cx>j1!!F^o!b&Hf zv8psSfqQw}VCr0hljt)0P=k{lQGxQ?O-Bn*K1r5qI~L|aRElLnxL8Z@>7 zvg%fJK0g}UK3e+aQOD!+D+Bba*iv?~qs8VpRP zU;p#Ql>)_q^-wXmv4q^`fz_21lU4yQ0GZe~Mz$j&0ClE%eK_k<%rF`OUoiN21HP;W z!q|m~#o#@gG&zD^mEa)>u$k`x=iMRLjo;(BVJ7Vdmd&+Xn^P5eJm5#Q2v@MoHMzQu z0{}nUg2eMr+({mNR5;$yb8xfrZX79H1xr_X?W2wZO z>sn=Qfq-Z?d<{$<0&^Rd{OC0N@fcFqFW9g?3QSpbkoZqkfUzN$T9FJ!Hc&PFs7GK? zvwk_IC0o)2**2ht=R$Ve26%rzlIv)8DgF+i*umAOef2z^na?`ERh{56n=H+cFE&*P zpOlJLdL@CPtL+=g=s;7LzO$0 zDc{+qy&-_#@pc zX}yq*4bfRNn(soYI3vPw5dcyh6SR74MaH zN}!yqYKN0jmOwgc9d`gIui8k9_J?;O?|jdoNX@2<QYy_PRX`pvtKU=^7QG0u~>BOfN?xok@F#QFFDRG4L^Y@)82 zfB=f2z9@ArJvWI`!_h3!Inz8Yw?zPUemy0dnN!@yW^V8)ZC4cLTKfl;r&a^a_2%b= zmq$bMi^^A~Dz@s9e>c1N$H+s_3dul*w%@k`BirOIxEz`Fy*VCv>V?a`crx^g0+uvX z29@T3U)jg4-VE*`w|_vS={qLomZ%h604Oj95imE;+AjwZP8&$ifY>XkAG8ut zjIa&cJa+%mQ56}P-v{7X)B?P=L}%EMM*aU`?=J(YO1uAYSWpa9L=?)c? zZje^-(4fQtX;DF?R6<0$JES=@NFC{}gD4Zhg)l55=qjno#qz_g`{@IH3FLZ~;}Hy;0@! 
z+1GPoZM;@XwYT0ZWlgSut(;V!d(~A-|DzqIf9d4Bl2w4NvMlVJ!^jMV9A=`n)W;tb zO8BsaZ3R)EpUkvIi;Lgg=IZAD41pIdTBuJb8jr2FYWGn63KplC5~^-5VY5VrcV|&= zS`Bs}RmJ3Nd_nwV(Q0*X(Jj5mH(IkR`S(87FfBzC=FO;$nO_;mgeV$vh5A#dZ1a(_ z5<^WUJ55WX=BxiGu>bO<=NUo;C^+;>?|!l1S&ydNV6}4VCQ=o|NMbvVKDHvSN_)eN zlbBZHMe;+jJnlWcD-|;?N}iXnWrcw0<{Hbh*jr-b5}rTw?q(0?qJLFtnNz)Ag{tIX zG!|zJ4YeD;?fX#9wT9qqU49E)-2CHg(8#8d-=(tT5rT-aw1;RNYx_^qWd&_h*& z*JahR5{EE-Ea@4a0Ot-%i`uAAF%qv(-L3=z?1TOgxt3nXDCRudO23uMJY7(u=(rfx zs`kWBfH)wN@F(FaHd+O*`APH~_M;Xqq<;k3h2VTQw-3c{X$*CC=W(UUxO^4s2Djkd zd|R4{wOh?pOG`_%c;#^#<|p1iEQ8=1 z-C27GWD92G<__|hdVbjY%%26_gEdsd0ij=xZm8!jp-7R@>BC6azC~xbifqYpr<^{* zIDh$AiyLnWGD=r<+1^-5e~YATIsnIK=)cZKH|J33=0<4%XX=P4jD$y+sTEYwmOB{S z|4U74ZE7?{PWNVp>%t%m9LU3l^UbV|+wN`wr zP(%mZ<3{Jt&0%o#ry4Z`SOasnBD9|6iDVdMX{2lJoKe5{CWM?-Qy_8;M$Ziz*O43( z94GC|Mae~%H)wuatK6LfaoadTz=dm=)PilRdqEdYm^0`Hfo0@=iDQU&e7HMRZKgTG z`zJ8h9#3yJSV~vKTxNv{58gQl((h_yGjJaPoO8ze2xZ{gh-cV?f0EUsq|{#_Rg=|j z6lEDX{ zrh;H)L?lvgju*=^X^WzSd0&!TP$`yL3>9X&?5s~(@zv^(y%u#TLF7-*YYxro&HjJ%@p2ng^D-r3Cj%*Lr$m zd3vl=QU_jc9MYlm(ePVG0Q7Lb-T}*(`**53vvOynBVQW&LdobBdLv_>XMZKDtw+k7 z<{`-07A7*p>w-$#k-57S5{>WxrOcN;IV0?lF$4M%X!UAcsR6!^bb&iJ3>R!@)Ye_z zj)0VWA6)H<>d#mdBlwQI$N=Sy{9Bdo+YurVF36DFEUat!)+-$3t5hQQbB>J&LM<70 zgbk@7rFG2$vd>Qb*5uUI|1skbbt!i3gU7=Dqi4n&#&(#sTr`Wa_M-15FM`K2$!I9>~TDa(wZ zAfJkh^OHRlbB8jJXBV2jZrXVHa6WcoWgWmixj>>E2Q{!_SX_5`%zP|^+$({Cba~}< zEW$@eGZUBPTG@+@`J6>M&P|ixD3tlNhVN1gr!rw@ zy34Z*M~ee6(tb(2aaH??PA=nn-bdQC3Q{&CakO?*za)&#U{dpacwQ4`bM<9tbBpV4 zWBVx{HQ{jHI9l4nw95B1vsANIGj?(+Kx|Nh1C2&u<1zHP z+-wWo*xwE?h1f<;e2u2 zEp(T9P$1$ck`&iG?#mMss#dkbzKlbXmES5Lys{m) z(dT0)%^RZJ5a-|OCsUm^Y0+r12dP=}xzW36*V+oM(YWypN)x0L&mQqz)si$AR3bVB z0-dgppU#Y(ib)`X@2UO_ID*~w6VX9smJuJ`<0(1>L^UZ)UoM9QEuYZl^j--p_SoH- zhuzjL+GEFQJP)c2%th8VNY0-$v}P54`tUeeMEss8V5Ukh)_NP9*ClNL^X_1 zRwuTKhKMD#C5P!?-9w__o<;*rR$aAkxrRfg;#h6Ap=X1t`$_CNz9KX4#U8L4x(sT) z-^sV1R*hzmJP@dVz81!ddJM&ROy= zF)JFusaLI-gP!UY=q=U!hjlKF{`d`8XJ6Ar;AaBrYhT&|U;=JPuJ0C>3y73SQRM5p 
z96|9cT7L-U*svjbGOc4~$uNy}fTLWhS_W%OM~AD0$5EhGn6sgoZ>R~3jmLoLr&vN% zO$?i;T5a3|27Nl>7?rA5SNOtQpY6hks(J3tM<|p?W8cB3~a58k|}ptbAYZqJ34xXrXYRtl}~eb zMPU!)R#bcWsr`(p3Zi*vOJFO*KW>x^20alysliAfTaxhl`Om7mcd*@)eOH7wo={e9 zhT*m3E!#cW?i0ua0rAXPF)ckCsr=cprB>g1&hll|HK@6yx?uL(lQv8OC9$=tmXT&K zx(N$V2o8KwR-MOs7+&P=@j^|t$12j0qpFpY7|E@KfyWm&2;WT+jzhR-^H0D%-@}r| zQ6Rdpic$dk;!y*1&X=+{aAen)KuvSacZEw|l1H;oKjI5IHco24f4Cspsn zS&lDa-$wT)TdxLxt&Ql`2tt{tYTaS^&Oz57|2qdwaOxB^x9ladC4*|)#8-!4Af=lc z@l#Sb4+j^`_4j-!vR6l4)9twaXulT9VB6huCJoBOD!W@RFkmJ#*o z@S2u>i5J0xv^Fdf!#h-Hc9cAZGAi_*w!k?DxwIoH|KpXtL%6HZ@zPSbR~jJllf}?0 zaRuCOkC7BBmga_Idd+m;<+XFu+^VMD4^cIv^etV7Z@)iyn-T>C>%irV$C3hkN|aY% z(x6m$2E`S$4aF-^NH)3cZm+q&;*uf%gK>5vqCv_McQqmsX1b?7m~yPcp}c+6pk;9{ z|3Y*4$iY0fGAKnlSPHXmeYukV<;hFc=bRa=efMDw;c1CbT%At8L&;!QN#Q6xk?%OM zBJq%Q#+XcY9(St%UEKZgm)GKK{FzLzQ8wXq4{;x4Hz+Hf!l|v9Vk&bV-Rw@hp|>Z1 zD=c+LYGy+Lc1MrzD*Y%*sRj9FALsfTD(%XrXJ1g768)J4V6?O{3r4yDuA{Z1^a@W1 z9%5C=POG00~o(9L#@82%77Hm$=sJ*N)(28v&>vB z3~~M4PWOLaD=(tP81??Ayre141n9Ms`-@LEf~r&x`U_WM(r^j~$%Og$Wbba_Ci~kl z-#Yjpbq5fb7hD$ot5}f)%j<1on;9DV3*Pv}Yn-)tz7I8mFEJc3Z8W&nh4uRnv%J~x zC$r3><#-o6b$tJ&e%qNH#V%}bKIp-v^*@0w;6;j@5c++vzN~1)?1rrZcoA3+zQeSr zl0MA6I>9nwx&Uw3T9{V1_q_kblQ*&)sFuJpslB;6M;A~mZvtgZ#6rzQoz!!FQ(|v( zUDs`?*af-^QXZVbxttkXQquND7|Ri0#{GouoZQckuh1~28&Z_C!JguGUY_y;zv$z( zqeS8i?AZBW{hImnA>&#E(!X~~FO2-(nz;Y|zcQ#j^bUgbNm+tG)`p?^d|W6V|C#xS$iYVzp9=d# zGIv3NwO&VjUA0e|c%O2SFeoVA?y>>R?Kox!>Y&eQNs6B*;V1gGW+T=2D94^~#`0&e z>0`PNFx3%VWj=`Yd3aX6#t9pfQ1ztPkG3R$uFutiaXRPhsNfg9KU% zA{zh&KOmaYLIFAfWY{33e#r{dBnM7nmBMB~$W3Dv>Ht(F*_WWC$CF~#E=0y&Qu;a6 z_o@hp_6p?feN`jGW00b+YZw-dlu9&v5=|1gq^IMLXka6j+dOuQwgBCI!QpXd%2Gh@pBe+ZzBGm%H%wyvN;n^hq_AY)a~1t>dzb}Cd=D?>Jm#Mq$6sxWT`EX?_BG2 z=2)9c#n}51jy`+AR+q`vNVgT@)y5vT-~z*dRrOHRGnGB{aExSbq@AtTSF>3<(m?ph zHZy2Or)X~Gc8WYolk84VA=9H!!si+G)17tFI5fI8x9oN)H- zp7S&Fm&|yvHLnstnMg;h*qL_+#Hu@C+07=mC@gK-@^T_C-eilQ;THq+FLb-isy8ES z^XqeuS)^~_CaAZfXw74je-oN8mqSwjvpeT$eyVXRZ`J(OX#a5u+GGiEKNy92;T~$A 
zq9?M-2Vr)0_B<}|RCxxHD6?hktnGo&yP>Pj&$60jbxu_zDCu|z_8ogJp25GYN?}DBsK|e#Q9oh;PZf|7}OUW^d!Q z?82jaqiscnyCb7=&w1&$k`GxAI}=}3ckA`i8y1>)lguH?PAiMcCL4)g0VJ=U30xl+$^F^;c zgqH|E`VQQ)(IfT5G{4P-N`1gI?IV~Jh8H|`T^vDee06|xS=9h{zAofq&@~1KX{p}# zap%SZ%}WH=KRVcq-J-39rGK)>XJJw1aCeWcpxodX>xu}y=mI`VMv79k)7~j0U$lVt z_Pc{ehvAj5Gx5BIQLH^8;O@n>W0ZC4Lg?)@B@Aw^=)-&zu%A;#p44^u8nZ&T9$eFt zz(OOSVW~l`tE7+7R+c}luHH(0@KHd`*Bt-LVO}+8U&^9M?}upfQYNMrS-pG6^yS;W z7FH>v@1Up|!9gLc!f(UH-aU8ob&K`t>_9=Ghq~YYIW+##g7av44th}a+?WvH*q3(w zCinmxaTs*E%{!3&uPoJM+0_S7Q)N0r@`?0VVnFj7-t?Q~u;P4s%|JN8W1vfiPht_K zO$2bM!`B-1jJZHQLf}%`W!x}|0v&pXQl=ZN{{03`Ix*Rgz-cqfbhr&kI1fBlsqA8N zR1f6*9kWQx@)^#x#y|!+*APs7U~s|^>Oj>UB}OyhKFr;Ub>sXLFx_+%`y;+L>6xB( zOcVmn$qi;-yuSoJ-BUbqG-V0qUS1*l7q&rxiRd^>;2pIzeFJkIBWk<`zY4DA(gx*Q?7Kq^5ST6NcBQ@02z#%sPbFy+Z~3%m}# z^~v&KpAd8+d8oi*5?XXJ_Pm!!=Z2{3k5Y2IGhY9Q7y*)LgZ3U1xoA|uHU5}03l8Qt zZ$%<X_M75kF;2N`vd*y{Fp%gP`XBiW zp%(PtNx#GhM-;oNN|9E)v_A_zN6w5hi-2gN9D*C%UvfDj6CiEULqzU^m|cY$<7jWl zM4GL9(O_^cuHO9C)k!R&=<2YY(pp#a4v+Z+Ol7%;i20Ldx*DNOpq^`mk@u(lO`anI zj}$$%5O+qNgkomAr9C3*Ubx*{c<9lm8fzSK5F~SS)F5Z7bkMDfYCPO^PXn4-rNHPD z`S4tAoV+=b{ihw3>`TEJkk;#T>^&lbWIMDY@ zX&a$rQ`8-npB^EecL>uj=tdR*kEBBsEG6SfK`io?q9C_jZV}kn^N*wm_9MAR5nNwV z(l4MX66b1GS3;M!XH6~iQdLdE`;`8O(d^>0+{fyBR2vCKgs->KMOuj-={rdnuF zanMixHs`d^m8@jX?oD9l6(DNmvW_pG&3!)%hzNc!8(DLEwxPJkG1G8FOh0F#r&1+z z+CFad2~vn!Yb4Lg6VG-yfvDL(@?k@U=@!J=ylwKkH8{t`lInE`#}8VlSJ~RNhR~f* zwms1?%^hbUm(Pdiv$;JBz*M-1lWnU|4@xI?4;PxB&!jaXg@mXnC0zcnvpQ*<81@yJ zLv_+LxK^ZxLWH;wo;E|n{EB|$^qUrI?oFr2Bn8xCHLZ}1b2k7AGS3gf8NcDnE$Z0L z1w#}D(W!cLuNZ$5O<1X=eLFkHOT9ndYoPK|FY4S+;8Zx}%fOQMN&mI)9zuXcEx}t+ z+ji*DZ8CWqs74N0?3k$IJ}=J>botYWi|HiI)r8jkm#WRK4IM?OT9)VkPSvt%Wi=b< z^_ro4b7NxKCmT8s#p}2u0?CmMZ#JR+jPI#k4fe;wz^rGlyy$H6ZlAN6 zt8ZbQ0JiW+{^T~`3QJFtBnn}9e;~;ly=dhUixRLtY5;*8e0b>m>H9Ef3vp_LwGLuI z>0nQ0L&1q)wFt~O)>_;}s5F3nXaId1eMf0%>)z?YyE6^?Y9B>#yZpoVuGq7EPQcV` ziVR<{%*(20f1RkEXKWp$!1oO2zrHkSBJw_0tMih=8=fXn2vr;K9TwmDdJGZsukZOy z_w;^7W~Sv4mxTf;6s0@dtBTf~($*XWqSu2qCj6$8!Y&mGCY&A{)5+YH8PvE2Ar9RS 
z-kM6QIsG%YKh}7OJ?Jhhnvw4^Hv(Qj@X=;|K&}Dr-4VU`tYMt#hV1%}rA`3DrJV4* zAYLYsiR1q!NW0RvzL9b`!Pfp!a6F|k-2;g*qTtBuoasS#4h+J`n_+~!-5dP5HFCxP zHuW->9`MvfJIQF@?p}F8=)Q=ac~z9Fb0dXf7Tcb%XYXNI6$%aFtj^p)4m_cf{v~D? zU!s7JycUcQygvFPuiPe)KzJnn>-IzXIfkr=qbCkof*56oGS)(9HjqPwfm~0U;3@G5 z_4b<%`%7kV?RN469N6Ik#B1XxGcaSU#3~@IFU)d7t@;YFYuZ(nQw(loJPFnV*q%te z;srn@CA+1wMw9Pt+xj`q_IH>F>Q=axh$p$?FxoEoETw91V&gMH_9-pKKNjpDc)EZm zCZH?Uw`jj(Q%>!{?#u_dry1Ct*CWP>^0^GAx?adS@TCk6uN7yo77Rq;v)+MLV*x;w z)h8L2!e-zNunAmsHQG|MAjEbiDP3X4f3^T$)e(R)B74I3=FEY`xr-&Lau)jKF1?8g*Kg#WAN7)`v2{_C4JW|<5YPv-jDTKB~bj4|V z-5@Ya4=fvMR_~=J$WMcmUl)c#MqrZSXqC+fP#o$0v#1a;l0jBBiC1kk(ixsVDzMvhaQwrm`kyb4BIm8QO;o=e7=?qXfO8v$ zc&4DVLICqN=MW|UT$bcTwi8(OmF_kFGn-#~96eT#{S+)l`VCV{!hKAXLY-nU$|+%{}XkCn>|>58DIEHV*5KWst06)`D`=b`m-QisW`JV z)8z*gTI)h93}c!CicdY-!QTy`dC!zTh^mII$EMc%s2zwGO_v+q6MUvHZ4_4;7w3BQ zW7bAl3_d5;L`LBtZF|)|AvAs*=7GLHW0O85PVJj-7L)=#~&u^RkI-~4 zPQ3SRo$vcbhF?3*+a1k_FJf_d^5?Sq&kOR$a%k`0hu;nVR)3)t7W5uM6e*I%q2;ZU zLWr!q7_q}=p+ZGOi_4_~w&RT(W$Ck2KmP2HSD*!-zD$G-WITud^kiChCdgRN_P@7KTCdHkJkK*^La4sZUx1rr`jw%x~e?Y8!42<96Hs6iRF!2e%ej z3+(zJNYk6?&BS45vSt7-vq)ZP@TZ!Dd8-CSrHK6%<^+n{0%=tEp>mcSfN{fb&DXQb z7JXO?Tq2-mZU>6rq2d|f;#UHLPD;**>~1a1L-;%ze7DI%2(byi1RM;7 zI)QS>_UCvi(67kbxvkFhwM}mz65?at9Jt7vJ08bp=H=hn36Qquonk5VLWg zNhX@pY@V(_3vV}#u$9nHtn^hsp>dx=a8!N>p#7qJyCF!7@iy!Ft)zLgkcGoBl!J$eX~GAeI56=MWuuGP%kd%3SFsATm81VPqi6KCCxQ zgLU`3A?0s~(s2hskgGF8jff}IaDFJ?o48qp3gmS`L%@I!LrUOjj@4>;{2K*NW#Skt z$Dju}kzVkbn1q;c@Z__V9x6htR7%AXlGGE#%0fU6xxP@oQCZJE#R`>C^4IP{;v>@o zdC3}MPe4{~gIH|Ak!=g!P(^Vk@bB@OJW!K1TCqRx zpZxHBpg}agQ)Y;XwB3iWybun~qCxZY5|C-E4}d2%_SoZ~a@oPT{JG4c5j;!@rSdl> zBbox+S6@t)uU?;h;N&~#7H9U7_9$oVVGw1UJ!1)Pmkfcd*}@RGqgjOxIPd7+t0to4 z!z6(+z*YQiQy_!&XqBhCh?NUv2>**`^piLe<`#%saZ6a0b>d*ARj&5bTs}K9T#|RJ zJKxEsmZ$Amv9tY$X%{)EY9IetaOmJam1zwMwrt0^`IU;TP?v-7rUb^2*3ozq4#5t* zOqnS9qk_Rxk5Ivm{U)OGdchu3b@s5L4#qy!1)z8w;klB@a~GMXLFuU~4b59S_6xfq)LOcx zbaLg^1m{3Y_iK*Nf?wg~ml|Sz&7Q@?{WJt32>VUbZVhg(!kYiuS9wOmI|w!duK4Zr 
zEDTm@#tG#q`{rd~xgVCQ|LUbl{U_@9B?fl6&dE-ob|{D#r+2!z??EY6;iP}KL{>2R zBzDJS0$9Gn4(Us5tIo_TH-nv5NEZT4E7JfXL+MK3r+Y5M6h@O&xPmuWY6U zajXC;5!6HTfL-M&d_J4Bl5Z)Jc0)U-yC9mJC?#c+6Wx|L%V(oMZ-zt8G+LgadBhz^ zaE@4~%RTjQvA1TY^DEm3e2oGwHnsFh>3nc?V{V8jOjxdPMl~bjoZ^!@E!PmV*#8VK z)~ou8W-K8IdJ8*}z>!^3Xh}_VYAK?96|;sGMWBay7=9dNb?zuSGIi<(o5UrZ*Y^^F zH&|$RDMoAf0f))b<$L)sAwwkPq3d2#$gx5=1S6CY0aVZ2& z26AE%xk>e z>C1<4F@V{*#YQkGsXWK&a zdu5{JUX{yP)uHO4_L#JtmlljV323W7X2BT=BgswTEmWPgi|Za!v>lPNVx3+!TX=G@ zk>52yXY=T#E3tFJSAMv6*Rw+tHYQeimYNVn4#ST*iF+w3`g`87`vHhj7ze#ye%jZ0-+S_kzC|555h^7E@~Ulo^@drm@QcJ(Y_I0MdOBCjeA`27ddHv3vBgee5 z+Adzt>UX>K!&8l5AIonqcG-;>Rv+*|m0W$$6;y#lYrM@_am|b$O#qGJ*gR?6tCCa0 zoE-6~)GsfFn76o9Xr>qm7wPLs*UEmlSw*~i{Q4NP1uHTb%#z3G8xRIVO6jHP#kVP4 zKE2`{cS2smkuPF7m`|SvxTxwgPx80*_9<%@_8NM)iOWfv0+gNP0!R&_TG(#rWhaYK zZ?>z!J40o+Jq7bf@Xo8V(+$9hTKID3vPp5z+@mimoG_!zz~2kSD|G#FZ_eGjKTq6q zi@>8~K1{Dox<@x)@-fjY=xRT-yuK9QKF`-UQ3u9KfgJkDa&Q==(#>t()Mm2Nj|M5a zQZ7{nAaFm}wIu5HX=dJ#<9WP?@TfsmZUM}OLsE;7w4dxaAeCMra<=)zd>o=E`7YK_ z-&2bSp+mpVUGEu`!(8?}NlB&AbXBztnZz47fPk2-CJts`7>L%0Hrz5^uq#Ola!$i} z-*ZDI)3-k4>UImsWkQ7_vP2inS^G;t<*b=J zK80Er77hV3;07`ZJ~Jvs&CXc&Wkm{uDKPTkaSgW8!04yzjB{?mpDX470-ov1j{e{a zR_Z~~ukTK}>#fxw)3Ng$cOyd}^uQWdTrcZXp}r`#0R|zu`)Ie<`h`%`&T0!L`?@LK zAc7(yl%~gV9zBluHs4wvO^`}zn(v=wigjL-@GTSUI=uhuKrFX#t2)^33fW` za0myI$^#J{jX%YirdH<{qWY$rwrp(^f)Hz!;*3rY)d%MPIMEAN=(7iW(;b9DlO?cd zx84g`L`13n&^s1Nt5trh<1}w(?ejx&m{l}D^o*g=>J0`#?!`AW{DE#C?kF`_#XFhJ z4?!*eTiQHqnGTQkAN9ejkb)lM93^|(Ihig42Wo#5CU{bgVSx^P*EA8hD84gCOC1-^ z`7=^Btwkt{LZ{*oOg@?Iv278UMm2{C5n1@{_UaP73(S$QE_)r#2%8j;UzXhoM2gbf z?xzPp6H_bm6Ph=LA8@0daJA|H+IE3`Nq;a+$D}D$md}fqM%=48WQ_>e8(;iJ>AM3Z zoltqOD2Ru-KeNU^#<=8hq4DZeD_0KhQ9c!q7zsOaApoYBo-!EThf`MRZfd2(B3KI7 zJGcn+++Yv+5Rqp0b@t~8UtJf<*W#=T$WqV<>*O*}VrSwAXOwgoG~&8E21n`4L?%>` z<>(Jv=5Y-VDDN>O)IwLg@M$Wz?H)}hko)2)L6hc(UpbrF5;$I=aUld?ysm{-=;ue9 z@!;sT(R}{!lCZ<3K|8jB;KzuxVq`+Ig$Rh4*+LGh-r3vug1T&1$=cQv<7=2{(;}l z_U5&4Gla@{mSlAYBm#;{<+2V~Ky|qbs 
zeKR{lttFhFvoBWp*pJQ^fU95&RUKeTJ?L4_nRC0&Cyrzhq}OOr$lAjogy+3|zI{+7 z&xn9fuEW+pr8MZGDf1D<4Hz?odwQI1*q?vo^;`c7-BtmFak;*^%8zl)sY5B3rL@PvO<>|qm7F~ z+;)iB+YAi5i1)vZ+jM|-o-fSK^jzWP)%=#{RHy7d6j}}SBCf}~S$o)L+&MB9zGnfG zB0*!xF^`K?h&?mh8>5KPj$J9 zqE({=u}6C0{9dYc_Mo*dog#)TZZy;G?T)XbJfR(1Ct+){yR6@+U6}Nz!EZe+YY@na z1vEkAXZLn@ys(bv2NqnZqa(3^hRVgaT<27Oya~LJ&~KP-ucPV%c)1(%fQxoY%~HGg zUThbmf)ng;otC{Zh0}ut7WtJBrSYuZ&Oi85st{s1*VE9N*isdts#y^?-<5*Y}V7eI#-;JwuaS92^=I<}lz`fVs#vvCU;(G_{<;Z2rKUKM|u@*|q2A z&V0Lk?|X6agR?Z{7;+*!$*cMveC)IjL9{b`%V36h{TwLQ-L24q@$-+F4)R>}q~Q|4 ztj-My6PX}(#2r!)DU6UF^Kz4pp-2387z7qYG++!pwe>x8F%lDC0;xh;aby@yhiAI3 zE^1R%WmV=U!K5oPfEsVu^Yv3eFG4Fl3Prz9*vv4zqqiryPkMUOUIfdJA5IS_HL8m@ zCF(^kkf!YSL@rL?jVYDl6gU_y?(fQY?24 z|IbuduYGtS=_ec`#F4+f`Y~$sPex0^99nDDuD3toQ2x13kevYM&D8l!2EOl>CXZP|8XfVbYSsy+gtvWnEg9%0=JA!!mQ4R-2DR*)_Qrf z(e_8r{r`MG((BAQAamVduY{UEm^Zjk8=fA#{U+{h;11FrJf8D2BiY|S$H8$jQ*f;r zu7V4FTrZiAqFehxCwL6EZ010HP~ha+jhM9l@iYJarG&;|L?XZQ4RV>cg=|vt_9Qk1 zF_uCM5r)rdP~c%Oetfu#=~sUl9fbhYkKbQOfYJ~yXJ*`VhD}1|+9p`THy?p=D{p3e z)Qs%^-k$$fQ-A$0&w7@2U=)xEjNfXJ(QT2`#*zD(kRr(qdT%736G6GNfF?ojm#qD@ zYleHnbLzXD(Hg|%*8aS`_n>TW1ocx|-W7{Z40bf(zWSLoRQ_0aO&vYglj1wh{bU>$UJAZr=&DV+cj;O4 zZ~wpfRkG-}&SD!=bZf9@6$6a-&bVhF$6z1RK*wE>ZdwD`WQ))mZ7Fw_D1XfU{^N~& zAmQ$b)a{^|?`PL5ZU-QOZWl|gr;|+>%yNI22V|GO`hdwvO?5o^Ouw_4|JXNw@MW4h zYzHi&U(fRW@*fESr+VrMZ2nIb4S#*@Btnp&=|sk;TInBrivLqy#GF8=|I1;3H~r#| zq#*giY0K~I`hWFi9aO&z*DlM{wf~ndU4sQ0^6vpgzx)cwB5}fpP=mjE9TG9J;NVK@ zJlS^ZmyajqLvZaZvGP>^^U|wedY0m7dP-~kbMfB}S)8ajw<@J6`Ht|@LmwaF^E!^Y ztOaQ19?eXP%|3I(^xJc3nZ12|nK2cGI<6r;!ahtsKDmt*?k%+9!VJ*s4F|3Z-6ACE330D1b?4n$f1^7XtRMo!z z!{N;Ck$&&>$1xuxLT<+bH|q{bQhF5%$YBOnE1*A)nF3J|MALZEVf392`*m^z;0KVNvB7-m?ZxU-JVe;$PdM0}S0H_da)bR{Ix{xw|x zKliI!5N|BoUj#SXdsCBG&AeiWsWA$q7gCMNIUP#Fi- z;U#D_450U+1AaS@UAM3i;7L`KUBy!5!gUb3GVt?jtuB1v)Ge|OK~%<_&N|8N+Z%Ct z+Zm9v-n{?}G#MFLBSeq1l#2u3E}N~=(*%A0b3&OoGL*scW zjC$z?d-t^lKTtzGHv*1Ram9nOE3Lx=;!K9{5qkeq!1Cva1mM*1B{|I{$( zy(_$;JwH-jI^~v^n=4;wZ=4D&nx=C$?buDu(+f+t-_P)=P^=a7|G2D0cDXN{FH|<9 
zP|tSFyZ_iNJn!$T_oFoOF_ZAw@86xS)uxu2kZ=|BjG`y2#syWJV+aHht)@CPd-96d z)|V?M@1M|CdY%nQ%uS7rFF+q510%WZXL{MwUVnyz_Y#Zvh#$??=z&W^Ns^bs!oof` z7+63Y@*?oOLs#-vhAJUiFT9GeO^QXJeKZ|n<*!8^{S;?}WKdkyw*YH57c9jTSI2q) zK^Pj&+W85;&#QD^F?RxnThvMN7>jvwdDQ-N4W3Q47Y-zpTvwe}@F>oO0drae*y2=* z+N5n1_~vFXyM4muZQsowgXzfI_Tu$}HQmm4SyEPiEXNcH36s{2QToUh=|`##bAu>Z zbJ7mTtcW*9Pj^Zo8dJtS*G(>ThP{~e6V+J&xa?UPMOZ91<-lk{T-^DAw?9?KtI<8u+G<$XWDd6+G#Lxdw zf$AstaA&2U-RGw}{dclU?=+nVgCOdb0E?LhsqAcVzJblbF`q&t3Rf^OPu~p^!p?b! zc0e*(-ng#R%J%ro>}9@byK(F6&ohg*f`WpDPfe&2BG1I!32MbA=6 z!lTzLx;h*qHU}5WOkH46Y~T7Hi=ucBHxq+(;~!?Tr*c))5eekbq+7o-?{agbm4f9g zwU?jA?A{Y$0huL`!HqTh1Vu+@YlkEOh|Ist{>8O^*UYoX?2?RoK zW-TR~=A+A#KUkqLN2rCm&01j`Mf}!E!hT((-B&ER>M&e=?ImPl<1%%N>h(x2UKbWd z~tBp$&2fY$xl<&7<SB%nVGo{N-Z@2M-3*nALe)IGiqK89#Ickhc(i?c&2!q-}3|hpOGg z1~!FGszENIJnid%$>o%yibVg7cL%X@=ohjSUCEe3JHmO%rR zF-HR$7^VseqFVJ5@=lgKvN%ylZc1sZMsSwCY4yKNo zs7XqCLdbOWIU(Imy_rCdkVLqzlx@C-q1eG@!JqDC1iOSQTUxPymfYXSQ%os&sh*CL zbl+=J?yhiisVtJ+T!c4q|7e14uA~TWPa#xNO>nZ0!)A7})bGuHW`t6`LY@#g|J9-= zEo9PpwQB!3wZOe9W1`YZw8@N{)y-RD5Czo9+Z&xT;7!CH0EylJ%WvPm=F`J z3GtfoLLD$IDQMCFFB__t2k(odLs73~5Uz4-Yb!rbyWF{;D+!nu!A;(`dGN`I@5075 zPZ?ytUfkFXNi^IRL7x`}3s@KP7sl?0a~2YWl=O31{`@5gMAYB5XG{Ko3^j-mDQOfJ zM4CLa6M>OCCLXZlfi*d02Wd)b&&WqR!2#2;0i;KT8Yi1@6FypBECW(+J0S`&AHM?- z#Jl~G1*RRBVG7_!Gec;*C&0e{0&<#$f@qdU!3_HnL{Cb{u&k1+;Hi2iI@1o^mdNCz zd^_OJZ=3LT?kR=YvVwix3?Bl-ckWMqgPalj)kpL6i)H)JhXytAKQAQTooe)nR6s`5 zAhjjEu7ruo8E#`g)0_S-)7hZk$HE&VS_D>`T-V*r4-}d41RRA?99@axeAFWM+vmN_ z!~J$r!y>D%J-R*Yc?q_MAkdc3TiOD3$-5VQZxtMwE&*bcG{F1 zZe0`IfgumYIG+{Mr_bsA)YKs%@t89BKJo8w<8{vm9KMMJY;T4MSYYJ@L876c@nM+V zO$0@#zn$Vt$)g~9?`nA1pN6Kj$$jCgiV_vHWTAx!KFS)w82I@&96_g#S z%BN)-Aq`(K#SLj`zS6IH!aNWSsmX9bMXBHZPz^F3^`7$cZ6KL37_K&-AEiedmfe4A zs;rVhmJkW$=_?SF`3hHIar}{C(;y`GM+++%l#RuNJQRUTyMG2t@bfp~G9hK6oK3Jt z`s?<@ls|osFA0V6{4N|-b_Q~?#T^9U@)w(6DK^`P?3#fIcWP1l%)VTs1`_R32P=`D z%W6r5Yf1+$o2Ycs98KwD*y0h$6jFsCE_$f&;Sg#OJs*lr$49{~BKMxxdicih`?nhC z=&P*lb 
z_m=#^%MSnK|Gzh`T-T7Kdpg8toyK1-jy^gjWQEuaCJ^1ZUGtv+BJB%n!mXV^Yr4xW z|F-l0`Tzb%P_3?`GwV)yX;%sOis0t#_k;01(J(~?fA9S+2~D! zmzIbjfG+QoST6=fEpSi3WOOz})bc4SoP_Af5z6yOf!Y;JVQ^m(Xx_22Puv)_$66OS zKRTZ<-?XF7>PRWgR9Kz7k~`9dpMTnJ_t#zaFK9WfJ8YkyihN4%Kx-eF3*$@eR|mdx z+$rZg?|*P~S)Te02Fcr>=_s}t(;<%u#_kk=gv4k%#=W!)lKiya-j=%Kpw|FcVGz`S zYK6tc0`>&5AXHoIZ;li$>II@(yqrwY7~faxhPp;A)8cU}!$ceL0)-psxhihqjtdlffSi zzUWSXqmwr^I>BXTSFaL8+=2GI9KV-G60U?0SC^yq1x5wYOEh();DnpbIJ$eJ;@S?C zU!grh*yJpJ_==$!-4Bj-w!V2GgXZ-$no2kTky7AOYqp;v4v0$ zETEXEMXfA7W+cfe1LixPoslOWtZxi{`SLf&$SZM$hT~;&TmZ6|og-ne*JTKbya3uo zWM*R%7)041-qnXi3F&5Ea1cJLSio!|hD||`?x295RaIA~hampo-JOM92MU|jM`eaX zSa&fkOKMycNXBik%*hD+oe2G8u1VXufeN=$SI}22LX>?Y=rC|~@vGa*N~|7}!XI7& z_#NI>S?^Ev9zMgJ`Qeg90Bx@63wQXz(1q}}DuZ(8Fn44wOmb6ubS+K`8 zHs3>l?YN$*$2D!>>4&p*Ghe6e2>NHx3A;zlT&=Hevx{tQdm_Z;XExAsRg6aUtFzLw zpxn5d82bS>>3G$3%*qU<{noN$+F{PJ_USoruQo6E^Nmqde)GbGfGxQ}50889Yj?@v z&rRrW1#1ceG$V%YA>Dx$udSzcBhK<$>os#w3)x=>ZO&xa$5<$)_Ysz`4*&7KmF7jb zu_MUJk`XtI+eD`tR#u2+*ryv_gwW8$D6S2ZrMr7Y5W?Fwf(2pb>=3m+@dW5)xSn{x ztCI**n^uEO(hY|p>(nZ$tnm>t@l5fH+PI)7lI}hMzBY7*+Bov^@^X+ayA^@xXc0_M zf^&UqJ+M`V?IC5%vv1gTpJ_d|=$^T{s@3zV>yPX$bSaCdZR6dOjUWx0-3~DLz zubXB4hxfUO6;yn%Ikvbd7uSY9znv#-DRsgrtA}$60Qsy(?qV0~r|DiaPF`uBly_N} z+-7?@$@X$>9o%kM-9T)g6=_?{SaP)##D2whIUUV#AN4%Y{I->TI_jmIhhe9QIthGHM&s1H@XDgR$ zV68*@;%u`#x&9FKMGX~71Z4EgIXpFs77|vt)4AnCh#p3bIaQKul*HwImU1^PS)gvK zh~C)-H)KYBn^1UQ>z6**0Pb7sn^8uhFKXTk)a)G9rHW9Wmbz@88y7Q~rC*AL7I|)3 z2F?MIhPNd4^~{mm)scV#NJcBIm8+F6OqP%s?W)dEF1y2mj~OFfE+H9Y#P^g*oR2;x z8J(xTCN#C_nl=RAPj$z6*Kzl7NH#o&8RmcMFMj`0Jy~M8D7VL{=uGh{cuY+n+SniB z4oA?7S8f*lIk8PhXm2wYtI%ek`}Cxu+$*-@ku8#yDZLY)EHQx2M<&9=MMm^AwF2gy zuC;X*yRq@ddz_W=@88vh=a0ZJSEjvTNAneB}N2dNI#eBn7DSfvFm&DM3E`&E@$tc%~Tva+bV(5d%7wqPQ#P> zyfw=ahH4f2Zj8_fJ6Nhmg6rWmgGPZz!KLZb_=S@jF5VnHCS5Q>^roYz?zY=UK&80l z!oVb=qzDpBo6vFypB2dy+z)_&lbERgiVf-VT46&YrcLKu(9nEqI)0bwd;23WEgrkax>;k%H z(XDRmYZx;u_DM_L7j+@dSFFa1yWfTpPs&FPh6z=inhvq~=L(;FMM0)|x@ZCySfnBl z-Z0w@+5GI5eAbv1hZZyvJx+U=Rp#rKG&@=>O*AQei=ys}gJ6>-;HX49 
zRJP869GtPolX)3gEZI18lm|u(93`ihPpqtWyHhJldwajGPN3Ge5C!S6xO``&QJGLj zU88d^Y5eJ5@6JEnphT>WiXU>97MG>*(X(zsKPzd+q8oovU89{nF=5!ftAddQP4&d- znHsdu3P&m&(gmnto*}!lG|Wb3O+ZsMO`X4CE5%#;>ybl-5liKqnJLU~tvXVy6Fin= zl>vX_0d2&kjk$bwkMnEuO0y0SE^^yIy^gZyJkm5*!BZg{i=*nu+TdO}1nJPOS*+ee zcKU`r&Di*85rAy!uHdcxz41bbwAfy5(M!rEoWt8C=3QSM=v}^Lw@)BB$TNUcSHFWg z9q8iBnI3A<(EcDp2fB~;zF{lc6ybuuvV>XDJs-u>o}}-!6Ltc03NfazXEYYboPDhv zsTdv{z3-rahsbF;m%+Z?&8G=biP&`Vuu$QmSDsUvf{T5f_2&Ue7w=A2$zg)%^bJbS zye?c|sV2{{mE{Z54q8qDp+(%E5m0!0czX*AQud6ciDh?NZh5Oi#=k5Bc%$)Y-WiV&I-(XeF1{duZ zgHMS?pH_dbzpwP*=`6Q8r3nX_x=ok;pG-|mqEm+)UCV$MHAG1}1eQYL)a``^yd;cI zK+T_=mn?5yK@sjK(sg@Z_+TlrKMv++C2}c$f6Fv&?@CPZbllHT5mgdkkEjGOpExi~u+h?VHQ+?rt<7URLcnB#cn(plOfLD_P#GT{B^fu=yR!OHZ&MJ`D zJ@sGbJxQNf?6PP$(wDdod0b*C=X2bo$>H;B7HfoNa9gM&$Bfpr8A2rW;Tis9n23;3 zsOb)inIfkUPX0d|YXY^5aalo^An4mkQjFq}W*<$NBD7%GA04U%tdY)>=|2o3T8Cx9j#e?&kvU zLB3i9#FlEgNen67X_pI0NJy|&Oc96uODgxoy%mQ^@Lfkhs>Zv?JHZe}{(5ss3l&%n zqTR`kyXO@f6{7@mq2#K#Gr@f-hQ5MQKj#CdK~@|3k-=T;U1}#Gvu_^^+UBby1Co_q zUcJPhm<=8Bc*AM_yckE*D=wYUzSaP$1BgOwlNV@1Q=!98ycq9UhlN@tefZ|E-I}1F z%>JfnBmWqNX8aejKD+1ZS$!8!3Ur$^76p-d6b?5C$I z&KIS)tLdYyMCeDBrcMPF9Q z(09P3#%_Pf5gVqd@GtZieh%wFj->?;eAuz)$|VC}N?}=w3pyu0;GoZV1fe?2 ze1-jSRK&2(t45?$`VnaYv4&Ezn;RNGi{T**Z@3zU{`Ac_L@&%Ft0lVKt497=zU}ODlAc1?D9GyVmre9p1)San`^6#_azqu7?q^cUSe`rfg zVFFc^o;b&n5ioaRe%GevF)QUy{* zH?0#A&ew0l8O3eM@>?c*7r{sDtZeo$EQ{@@dFwXq*`LY^IK6z`xD-?O@H6$$jTtE2 zLm`Dtw1J3*6{+gzB-qEY9XtTta)VrAk`AX$d%$Hx6A!}@> zitb(xF3l3hf|b9BO|QvMuq$&?D^a|&qt?@Vb(kGi)({XR=vVP{tuKEZQX;ib`FcrQ zUW&t2?FTFBgvj!;gbhBLh>!6YqW-xe;04FoT!3v9NRO6i8-)AWQ z^xR3ErFtjbB=kh|Ry}eo6m6-7bV^wyGz3@UUbg;+FicAL1q2L)l3Z5#E)Hb^nS>wC zH5iY$skxnQ$c5(`lJQs^0mHC8SdK(iFGhXps55ifuQFpqVPxOLnzp&MHo^x!k`(p( zQKKEeeb0|r6)p8f`XZLe(OIdOUjvlOHVt9w9C_}d<2g{>Kl7=1DD+NQ-FU4-)b{EBQ z_S&Q8oZDbm(F5|OMRtr74pFMCAGTNOPC_F=DoN6MA$v_W=&dMf;nc)R#GU>f6*H`h75u^F7P0O$jNy)YB04VwYw#b*bGj;y!oguqQVw@c3 z2*??9*c0nGtUNgg_?E^o?)@v*P1xNPzd^t zW`rVlUkR5-%?8Dt@Kg4zXI!^-8k|8+#RbW1e%>qTaA;R0Y?@BfMUdPo-$V=&R=wi} 
zz>ayv6>7VM_n00&h4GmLbsuJ^Hdz!Y)qd787H2p8((hP9ZOv_hmBinxV)BS>2QCn% zd=-gp%iUeL<%{aZwxQIr^)Uu2b6`p~0(94L*Mkl@39>}&*{ywU&;pAuB2{}e-N%P6 zgt^aKo?i`6jNRT|tHT>$GwF07JRdH#51$?^EcMLwd$*D6>g>fQ4zwRz#$ zrXL?AR5%CAhZHvIU<%#fdOy0Povhb0hc^fY3V6gAc5*s6%kX1`B6Q}tK;}xu7cAm|ph6}?4-67u~IAITnYyI?+D4#1Z!t8@% zldlAdIQ*6DHIB{FZf^T@SX#&S`buetNT{70?X_-YeKml+C z+|Jj%k7s#a{yoyy2!1w#jez)sn6+t;M#@7BRGZrSjXpAT=yKz!z1(}C41veg zI}-%_G=tbXRsa2w^eH}ZR?^Eux8&8=-RfqR(KC%4@Q)oTrDYp1Hkat>>1$)u)zo@V z&${Alf4cQw($9JB5)jb$`P}zMtkrt@)#peD=(xTQjg74q+%4BWxOj#x|CRWsdby`% z{2?0R@%c|)l^#`A{SFKyS~ZMRFLf`?gu zmenOW@BjXH{U2V*dzavy?KHb;_HWFb|5V(;Xh#t|B?H1qEB{_b^bZS)kKl<$x#ZHP zF-Vz*y|({fKJedsc}(K>LLd4=DE7?1S2Olx#WBex*P+bE?S5XHll8E8x$Yf$u=?R23|_tqo=4dWHx8fBd3m9P|-Andk_9^XC4-$H{@qOG{U1 zRi1!w`Y`b1yOHkQ042vNaYzrLp7bR6?LT@V@l@&`zXx~c_EAtDehV{CrKRO%MZa_2 zn|4UKQX)u%Nm>My`LDf#OiWGXp!Ns8%55N_lH?ZYBHbdKla3@D7WI;D@*U>mUiGc) zzAZj%l+@!Vf$F$_kzLhizE`Bv{wZIR^>DxblV$IzrrHSni5kn+4Uw$@xvmT7RJ*7t zwS4KwQ&#<~?va*}ZDCXO+%^KA)v%>eR?$*b*ZfkwvYGhq%Dz5jGs3mMlo;_Tzptfs zQm=+^DQJn>C47qA<$A>{(&?7j1_^(T?f6Nhes%2>ic~L_S zy)yT&v>nvto2qZz^*LrShIt!?CgOQcn@Wuu3D5mx_UP_e>){4)|^V~kf8OJR4?6dkef)6#@Y5k{;t-9pC z#ZXP}ZI-Eug6;s^FVT41LY@GtSiYt6gg*xtRtZrn!RJ97M+mh-Kt#4y_mM33qgoC!MM~O+#YxP_l9;)(g!9%+6Xb?^F0hr(|;>TD5Ad*3EE*JUYjqBI1Poa#oXEmer`tunbH+@NByIz~vLuC0V zMy1y@@M&JxX9ti^(|1%6R*2)tPT1C8!XRVM}>z0-Tzh>`E zv0Lyx_f~KL5x%_vPCQGyzO4tcrO?`Axo~`F|5Xh*y>_o^v9dvX`Dz)?zqMhX-IYR( zVa!1xQUoX?8r0RI$rCt3XPk-BDh#t!kkl}Tn|5Qjqd_eUjxza~`PZEw6GsV4^BjwG zOB2{+mt&d!C1b$R9+_mF=|o-Z0$sfEOlN8cZmNG+8Ku{0kAx;wylj8> zzNM*VbE$HWYK-A3zHt$d+v}m+2GAF4;ge0XiMfNk{d9@o)nwpvre_=EqXtRk{ z173*Ex_zwJtmEQ3@R8{-P4^9)B35R)?gGby9Z@>h2hhb%E|*#lF<`1qN#I||1IyX6 ziJPHe{5(vxi#Orf<^U>%f0oNaNuM0F1}A~Y_!g#ou`GM6Vq<2PTp)rm34|*WS@CVt z0NM;>5>azJSOd7<82Y=83J|79rS}D+Xgs+3hZmVp>v~MtW$WqMmP7);HHDlvxTgb6 z3e~c;awqmn{_+t?&6o7-9g;m*6F@@czs`B#wo7XpDcEAAlSh!`;!IUp_wlATE5C4U-)Mmtbc+v3;t0J9& zqv|8NoFr#}?&piGtM@0os6&trHb@uM)0#qtO_aED&i=Sts_ns}M=!U18$Gsm;z678!uNy52jy#?aXVEG 
z_;e$$fM|ORIGpP2J2usQC{2|H3Rp}0_*-%tJkoM?LC#HP- zqmoiKM?>{lXiWn3yxCg?=nKTnG6Y_G@L>=5l~76h3>?jafCLMiL>E}u9_)+bt(Z80 zIb~oXEmW#>rF5g-&D1n){h}Vly}^%$ZadI-@TV?@WaqEu?a)PF0}w$h>SlKe+(LU8 z3$%d1%XqdsQ^~dJ_xSFOU*)WL_cHkWLLk+#=gf6IE2BGiUNSkfUQ5Ezlqc*(BOLSg zP{=uLp}gvPCGy;Ks5cC1NycSXcrEH9OlFbtl@Jh|>>KMFR5g70-cGThigNDUbt+eh z)sikK8Xg~QK;qBOKo9dyF|sjzh(_c zMZ5!ao>POes5E*=r&UOh7~)=-l%Sgs!K|vUZ(!DpW6M7PFgEI!%viNASzLiac8GIn zOO0aEl15$}Z&+&RlQ>SOk=wj0jZ_#v<-0v(;pzFxLcfR-l8pJN_ic9m66?-$Ud}@* z{$;6*JCj>X{$+-5?ccZ;+h?6_=Q0|Q7r+}xd=p6*Q778ppPmNEfVET1}W1ygg zGhwLcHO9szW`#ReGrPc%bjje15=hyRMzXhobr|(ooW~Hdevra3Xfc<(XinLdS?-%& zq3zYl(t@H6FL)<04{xpSu=GVgyQmIpl0syAWf1_o*8!&2h6T}+q8A8tZmxE#hx9zz zEC)xhlC2t4UOGjMqH~9@3(o{H*KPFV$4OmSNN)Esrl=9@m!)_q$kKv_^v~PuH!{H= z3#PwI!0WC($Xz7BD?|OFd~Qxbm?17jK*$qdk!iK@ePm>8LrgL?qEH+(isJ?7LSO<( zF|7UTDln2`wzPMCiV>yBbln14vExmLmON&EJyF{UQ4 zk6@yvR;-mWfkq-G#9%?f1Y8=>ti)`&%^myiTxb5j^q)o44_ zZO?o5zf_h&Ens95WMYZ6WNXBlH*FzHu_-uX=^*?fHQtI(FJJ20&t*f_?d0EE+b`Bx zt)_xNqi@bHT?J#&-HNk4*ih5Y%@oz22NPJVu+0|J8u&W5SD2AUPnDb6ZMa_@WeUGV z_E+@MGIV__t34~i{HUQ!%GnE7S5`9i5lz0i8XMkDnbuNgenr2aqxGFeb+%=1%3yMf z>4g3LZ>*!1ohYv1Q^khWed$cQ9?2daZ={jGwN<)YR=1U_E)(^sGum44caRol^afSk z5VC1c2eYAH5MaQwLsIZyL?6H7XKlGLetw5zBWj4D;5}w!`h=~pCxAUo>2nZ)si}w_ z?R>`)4u><%EP8#CU6k;|?QUIv@19U7AFz}r{Uo!*AZpM4l>np7PsC;zy>S9gD%!yV zWS6l&za2K%m`30w?dEk>8!blLV}Y;Qx}x(3jihiMYtCR{w(nBBE}{c9NI)P2;!uVU zdMvjsAqYAVRKn4=Z@BnDfnc*|?|AE?Vq-0Q-#y|1Z}n!N`?qbX)ov1%taQAxXHEZu zd<6SMnNqQsi~UpgNoVRWc;}fri@kTe);of|E(6=F8Cn1t3HO6L7meg5cL|){^EUV#p_`vUES@WEA){+4-K>z1f{jd8&h!WMp7$S?y!*T zyB{tXRWCU2XJ#(-x!1-h-y+~1D?hE~SXr!Rnw%Co8AHX+y*lQUs9*dns7g_0t_7vu zc@xdrlvWcU96j{ObuJb6(;!tfy=Zc5!FA|NyAJ!SfFp7>)HL>l_CM5?rbT3?*4~ms}`ep7D`>#&)~$} z(M$slA0{kpUn@S$MB!UXANjPt#C2x4%P)6LGzxca7N6_fs=FhnrGyLAk%@I~+18vC z;F^oD-zX8H8?3UAb@(oxDqn*O7yCeNppL1Vil%3~2c;_~`bB12FW&P(AyDb0Eu;EqN_D0fpREm;1E%|+^YTh{`FLdu|mJ5rlK zuscgB+0PHNy}~N#$f-Jo{@B24kk$u0xon3DP(Kdg%ET(UhY;;kgH)7T zK*Xb=>>|6Posx6!*Vg9fv)jCuVB7#nT#-moK`AwzqY0#)G>0lc*1z)<>PcO(wBNs2 
z;kPmGigr~XqgziIGp%e_eu+^83jDjFT*ufzLhEp1rI9yD!tyh_3a}v3I~d+(3WVU+ zc+mAqVYJHMU?`V&(> zlA=7S`%0{$8cMOq99t&9Ib%ocSTF|sEh!vF)T`Bu+;l4vluS++WbRj;z0SAeQb{c1 zq}A&DkS?^dEUPx_%KK?|x8Mb2@XGanRGYNw4 z(B)J|&YY2jqCe^5-9p2o#L-IeL(M3~PR4n%Y1Kj7mYJcnPh=~1RN=I;sWpZ9U`@nC zM0;s?S9*>|*ty)x-e%WvKRcE(Hl*id4F235BJl?YG*v_QFN}U%+WF3mO?7f+VJvtr zeYMgtnz!X~z5`U%u`MZj&qB#93H}>1@vBYL8U01h)U56~dTW8VL@#v&dv89jfStF^ zQmE7ew5Q-67U8#_xaSJn?IfaL7>u$d1^72dI@|Ks4vE+%TAJ?{?@pxM`ec}I-US_` z`8dh=u`JGWs1Rdb z;m0;e1$u*_SF*UHu;`9c(>6!?|L6i;6sokVXA4T{hl~H=)~f71KKd^415a zFi)?cMLk#;$sc;TjyY~t-+#Ts@w;8(^}14%pL?9H82ym3O&i0Glep|~CY6>C8|p?| zvt!&1HR3gL@BCZlgm9|50r&HR%w!i^_u9(RSI!Y5WkZvGw<${eh37*Tx|%6&&s&PT zB1P@E^Kh?bwd4vLtbHKW6g?WOvU8C|-7U%-?R-3+hZ8naCoO&s?4&PcVD9*X#Dkv| zS(Lb;vC+D?{p8Jh2kX>4w^EzZ^a%l}re0cIY%gk8-y-Nms&~pn0>uy5#e(NyQS75E z9A@Kh{u=dDtts3qr^^21#--j9(jkhl1BL-6=bhzR)Og6`6;Z=7Yjhy9SZ~}pL9@5f zukV)W=;)Z5vWnkVeWqq;XxQ2^H8F8x-B~yUGBu730Ub5&2dwwfqt%;p=XT zZES>Q`^F^3-*yJyEd2Zt!Z5 z&d>I-6b%SPzh;YfuOP=xJ4E{T8Qt_uQrI|HK2o<*qt@lU)#vTX@Wk(tQ?yL%C#2h% zdmnl@uK6c+RKJm5q#ACXaLP8;{>o=2_E|Wl&{Vm|^WGG<+zW@6sM?}F?83+}B?q&1 zoxHPW?SN_Dbg%eHH7z9_mh}Nb?T=zvlwYT#EkS}^l6y6d8WVQPCnt?oTplAdd)A%a zrbY+WK`vSq`|D;SojhvQ7Io7Kk8o`aRmz%`=Zg(f=_W7lTIiZ1?-A`0Ifw>}t7@Q8 z5=EMR%bE*J5+41h3gwkGH6>0H?&s>VZjqyS@@@K}%VLxm*JxD^*2p5QnV*CU;^EBk8%}s2mMEq^CjI<1F!Z-^fo2LTY;j z_oO%w+LIouYs#%UM9vaYv$h>1%0EDW?ERUcsxpVY2D-Muo@g(0Cjj8s{(Lj|wTQ0D zU44=0gGxqW|8N9xFnU4A*<2lU&Rie|y5DH1DZ+)#3&ghSgwG49*H}4pbcA|aq6noC zZEw{z7`P{u^dz+}*2gw`+t~D97-`K)F(Ks-&77kB-5|Lv!$Qbd!i71vYMV#;22|TF zKNP=>8fsqX(-KyUJLy{aX?!cYOx}5<^mKgw^zCnMrwxL4&nNCwO*9l0&dMupYPWvB zRpLWWtlgPlm$W;tRaxi0X6;Z5!MYKV@{>)VLHkfKgs3!8k{xJp4{5m`L|?d#(z9nT zDa8DkwiFkxsO#^U9KYBF5LN{i|EYYCNh~>b26N3CIk{?e`xxI$ICO^lYSbd|;im%? zgpu;Zxeo`&w)N1%2$?m^#s!J0a#1hJHDg+tO7Pmq?0N~^sL#0ILGzBJ7hw^z1nwSe zK$og>DDMLb|M?e}f~{(;S1X1pFq;8=@UZlH>LT)G!Y@c|vL(R~!2GJ%Iiwv>dvC(w zwCg-CBU1Deh!$2O!Ayk4$1Xdc`Msp^GenEZ#rwkUp;qoVZaMM!ha6?}tJOOiZH*b_ zjQh(L1?1atC&VOl%w5`qZEWuno)Xe03GmXH-xQNQ-pKfExT5{5H#Wp-k?+k0PP~l! 
znJQ``YlN*K9Q&mA9>ZG3<_P{xRf(X3D^E zNE_k*b|XER)-R6)9AyO!sILqQf;u{V8qrnJLo%e9%Aeca8ZiQ>p#D7(z*cEQNb4DC z`q21A3{0;}ap-gI^Vg6zQpX?1^!M!HeI$EZTEqC)kMBgwEmQ!_0##jW9D_SsyEcvwx+Jv*|jr6AL#dCuOay6(w~@wMy0pgtYh>kxlK*cHBxfL!5w(CFb6&F%rb_JW0o5cUO^$N|`eDiHa5ktf;N zVG^XUjR2mQuLvl0hX8|k6lV{}Rubrr?%A)pk6}6kb8aRWt)p~kX4z^PD>|2x;U>*L z>_xUpAAlxSihLCB)GyFMoe>j=Y5yen`0L^HIHZb9?_j#IPr3nH@eGSbj{M>YOhUSG zYf%$52|d$NUtO1Adsrbw%kxtEp@6Hmr@JWur*$Infsy{zKA`UR@j*{E_+bC|z) z4gcIAN-RBK9M0>isir5Yt}2;VIeVo;>Z~HPqK_NMWhJE$08r-dujIZI#iPfD^mAPi z27R;%3SeqWn1roXb)}>8p<0li7l9o4MRs6TgeZ~F<6~owQ1skuwbWyqV{~=4&GiF? zR)5}1_Yn}_^P4HceJvc!Q<&CJ9cvd_rOrgQ z*dt6gM(sK`%ZA5;^jk)GZmaRhX@cMQiNZ8gdpgk#tA-hVuaY|`T6CWmI1+(bl7ylXZ= z)!MACEN-GqHB1d%!uQnCn?#qoachB{bUORcyJ4JW{F8Ub;k71{5$VBGcO2K~V| z30hMk0bb(ICkw~6SSuD$GZIAc2J|KR=$|`pnx?Lw32#4)xtuxFiThKqAK3(p+?8_s;sx(trsITFYRwe4F+bZ#B^+DjbBc`Kc}6Z zGV$%9&g!%vr>?q0S0c~Z-$RoYs>&g!G-HBLcyr_)Za$t+cSkXE7gwA@RV*7eYT-P$ z6O5AZgfg=(Yn}C1YpJpc!kypm-nwbBQsM8Rc&C)rOdR{>#;(e%#4`uJ?0A>m8pM5m zbyO^uk2CY;xfW)@agD48?4U{&d7M928|Bbh^0)<;gV+u*Ou;p2)Q|NckHdcY+XeOF zthp!e5ob>k0PE6&YF^d66j%Bx98a3|gh9p=)BSeX?)M-6utxDL4_FNIu{w%yqrZP< zb2%18BESt&*02rb$<{-Q-$CVc0PRb+$XFKsD6)e-Bd=nL0jpg;H=`g0d@x;X5K8l0 zXTcq;p@}tET;fj4Bi|0cc z3W;eSOBrr6MhpH}MQGymaVaJ+;VUv?iaJQiESbx^t}k=00ah?LmyKtT_%I}4YQfA|5iq(0ZsMF)sK916JBG(`8Dw^@py{N)&3ThH< zWPrZ`vn-fWokmBbm$Dgb$f4pp_M}2!9#Xv+n%{S#K86;#9aYu!>?rNkEK8m1Vt!C} zVJ1rP)&hMJYl9Ao1!3+w(Q-DrZfa4HT{wSJY&;J!wLL3~uvR^?En_&6k22nR8f-DU z-hJ&G3!@_7Z>SZ_J0Dy_cU;lo?6CXhJUy?xk^E7;viN&v>ggS!QY)9Ncmel14RJn_ zmu1&2si@6vls-yfpy2%y-`F6}5qWW!SVW93>NR=)M;YDkU0gYt7=EPX7nK#N^3B9? 
zmju?;*(j_OgAG}@mjAwUn?AG6PhIsq=~ro?nxdHY`1MJMyy`+G+Ub1ZnG!y&(3lmg z3t1O|0q+q@{H6NX?CnebiK|)p?E3WoycYjWY2?!lAhvK(3u5>_AML$zoaH=a_k2FU z+i>^fU|N}f$zd&;m=8Svy~)4JXMrO`>zSy=!RTl0Udu?{qe?b{Q-VvYoLH%mrQ>ro zhz!|->_~?-T4o;bM$UU6jU+)7Fo*$>X;(?unPa;TKmVnq=)M5*p4kr%`e5<1BcxXsoe&y&!cNi;Av_@^1hsJBRyK~^x9UGp5vd3u*wR3E zxkXVZPAb4WO)=zvD>Ct!?a@euRYo4BQ)Un34U``;twG=DjxZ1oa&=4iKcoD@1h1N} zFHPDHIrWKrX7~!uXwzt2508wB1=SO)XKJ93PlcY8DuoLG z>*OfHL7Pyi%F}WK(&jI7L1#$liN>Y9av<@K1X&V&4=_@XCu+3B^Husu85}DIfszRz zcas)TKu1*rb{=?fLtrw@A?7c_hPO@bmFQn ziO&pq+t+9gpS>!Dgt?v6&O!a9DDp2;2@UcM0x1XUYfjdvhl9z{e@TUo`#{L69CXi8 zWW(5|f@z^PVL`_15XvbQ(xRn(9qC}?e+ugBIJxlkaO6gAvp4qIVOCbPYR`P8{>u9_ zb%uQ8NSj;@cBt^pz3g4or@qxb(%I_28xh$T#)BnZa-|rMclLUmp3^&oc^Y=A-GHx( zkmIm4Bqlvqk^d3NEB)7}X3xHdr+*7P4_b_KS*(E@^}?k)fwumL-&}|choQGXI{(-d zJP4|yCDD?FUl1>Ib+6nq0UHIO#@8l?y7*QWTJQ7Ktc2bK=MSe1#dSn^Egfm*jsPCbRn3JMJ$lSWQ^6geA8qK^c3U+jn7S2_W|K9 z0uspOVuh`4UWBymx4=7dUkbH@HV;UJ=R6nMqB&WIN?IH>A00}<396bX^*07Ut#a>%(4$5s^ zdz5PyL4N;unD<^y;?^NuKGV-l;eV-!;xk55W|Izz!V&Slfpt?Y@82)Q{Ljp}J}@Z7 zo{ZB_6HRUds_Eo1R+5gkwMPTG87b7$)HU!(_N)>zv$oGm*s2KE6@J*XbhRlw)jl$+ zVLq5zQM}V}jBo?mws9#0`(SH2oi8HRJIRmHqbZ&i&hN;%9>!TYuFfyvdqS9a{kL|Q zeS00O|BFHMPfu2fRN|5(Ih%?(#cGyVm#RPEhJDy)GC{adDVq_(Qwes}M zk2u}IBXDFj5R70?q?B5Iv-<04sZ?7WjqgmQ7%>`~$>ZHh!!Vl|M@m?D4&2-F)fXvv z&n?w{rj-%AymB31hdL7#^Cn0(?0Rf`wo402pWE9~q^09U21y`15bS}WR}u0~ z>I8w%a`(LFWk_ERLTD(!XSFtMGy__XzC)W3LO6kVDjP6#Dw!hCfVQ3f1c3IgcLJ|u znEy#PhX^#tWl(PbPac9CKpd>sw-&vP5TXeXA{}ZkyC_8x!J;L3O};`gLBgPRW0q@; z6J=C)yACY*#vh*TH9d1=D^waa`Hk-umz)Me8u&*folcuPx57|4XG>@SXb7Wsu-W8OrkwV%r>=K;-ju*yR;1<5a7C3e^f`0z$Nuf02+> z#fm!*A3oHuRWqSGs>^4#ptQieb zt;=_(^Y%&>RH&3H>0IQ&PMcJY6nJ`RJX5pYDV@^)8B|mBc!S2=0dPDc)vt|(tT^_& z5Z4NXEJwOUhJQBEpd{^4eRSF4I?{9h<4C7O3KY(l5060v@)KV-!K;mIv|nz_W+b;p zVIlGSDsJOGk>SADcDA*i#I#o3v)0zuQ(G6OAaNxO+9N{_nV{N{8>#g)0lzq#k)Li6 z5YVLp(eps^v#kPHL8IpuZ?$qJ%Fu4KtKF=kk5aSt zg(EJ{4wh)p1uHwz<}tw9ZMG+J>qMVA%c%6*ZM5@%xjkgMyK?J6SpXU2p}>vM0~!uH zbP3)(>0ovlJU-b8J9I~3Z(^akl8B6T&8n8NxK)DgZ#fY5y#$3Y#Pw-HEGc&dXnhs- 
zgte6+xrNeqg3nnUZG-2ql;3t#3S-#=8`;^@gDKeZ;ae0_f;Z&ws4$W$ejR@CFNbYu zIwfnIRh03(r3W-#JtLPS`=k(0t&8?ZeYo8PlGy3$+7U?YEa&@cIihoUHT&Qr{5)Qh znj^^K9L3*wdGJp=zFUrU-qfD$p;wvoKe2uT>=D82^UI~wQbBb|he6q$q458@{e65O z-I?)nZeeS$9Ww94SWc(wTzcuu3Q$JR^W_Wi#2y>+^eQ6)X31N<4n~ zG&5!H{f^bfU77PNIr}D6X&x>}$CS7Oyo+s53h?&ZL#beIJ%OIQ1BWUpTH0eB)y8RH zyN{-SwGtz0@v=QFd9C}|MFWWoIR zg*edutRJ$diKKR&sTq@HIF>9x>Q8x`gb|yZn)0U}pDqQ5F3C}?>?cVXrDTUqD*+9V z044<99|2qQZ_(6;VRENXyd^wCCn{`~4YZ%uJH>z@Pq*3dkfAogRaW-QS#(CnI6Cu) zdAZf?Klk(lWlZ$Gk?p$_HG@Vg5Tl|3T=&Pm87q8*)pa)zBys8Axrfs4Ou+__WGKEk znsO;JaMOuUs3W=#LE7(<2Q-qOk>&5BRa?^lc7dCsVI= zX(o43zieLeyV;aF_JLx%@jF!(5@9QAN{Sow8Fgi*D3ln%k4TB^#Z*r3 zwB574mMA#WGr+x*Mt*TfIEFsV-(?|czWOC}R>&E9SPk~Ir=9ui-b@skn}3*!Ph7oH z!@<6*vE#sdsOe>y1h@7{MMZV)2|Rd`?H$)#k&ZJd3CJQ16Nn*@O)2i(7qI}HTKOnP zxC-P6$Zc@W%iVi1#5;99?r?ds^p)SessTwd4m$w>N&*d{_;&BD9TJhUH0#O8_^(GC ze-fe4nx_#Y#)?Atv`55Mecp#j}3f{h0DT_=YVIgNcw_#9T%s&uEk=B1i3iQc?avQ z7qYzj{z@GZ(WTO{zKXT$(KvuwZohGY=8y24Wb|DeyUJV4RNTX9b;Yi1&alp z;WpId{NP1BRQBrV`EyDwDT#cF$ISwYC~^EI&Eek~h+XL)hW0x+*Hv||ctLBD)&+?~ zGzApXM~Wmp>xz0r#r^%-b$nEjcm@4~c#ox~2eW@rd2s&8Y5qh6eNVlDv%fhKNe1HR z^5KVI@tA$>XGVUmyA8}0-A+{5*L>DA09~pYT@Tr>#ShkC5o6rk)Q~clIQMOUocqoM#G{FX_Zk@hN<}i%0E(rMHf7{@#EP>~~ z#DGA}VsqNGxTC|V>(g55)tMY`hn4EeAR>~bD^2401Byudne@d~|5O8BVXGRe{H|Be zDSJlxOw>D-Ok|5vXIlo29@EU*9A07CzT0K;K-rx`U#pk$@r8G=I7JQ9X58}36E24Dl>TcQ0rgMr@zmSW~}?x<#W5_r0nTm7L8_fc;{l_VjB0B&h)0*T)1Bc&`@4D;KMB}r3N5j!Tjfr4r2GSeD;(qj`y$ObXyF$_`1DpkJhB zpGiz}#?(us6yYVtTk5+EqgeO7IAMz^0r-*fN0a%r#9Hj?vMcC-KW=;6Omw0+Wo6q5 zP?9z{@IC_M!|5aH(Giv}1r}DU2x1Z=&foWFn``YIA>Zd-h&f$GWmK(AOt6vEyXu)@ z(X0{E*5@*k+VbN>#l3UF^X>*{Iox&4d5!Mo>GVCUo@Uai@8=VwU!21I za_FrzXYh3uZ_6?nZ`=DX&$=X_zut}JUou#E=DxkTzAj-^v%Cd8`j9=`dJkbY`Iw%_ z*s@1$+UtqGke+FKocaA&%V<*%IH^4CPgq*APu_EZ{NO9EK?7)hn$0T3PAg)K@myUY zX;0jMclLQ%s5B|FSFlE}8qtRz(3Z}-IzyUtMUGla_c`&vLzdS?F~X+6HERDRWR8R=MJ zx5RxTPp7vsN>1UYt)Os`b{cW69bdL01ie|EI{BlPI!5%sp!?XhDKtJC=OIsV=(HNy7Ln}G}ti%=f?4q8?gZ_=1^7vKFW;SYg@K;S{`^ArB8^7>7UM6 
zzrU100u0XCB)0j!S09ipuOhD;)Ab%L(=lIqYM%TdERBOkMed5Qz@I_1OgFxnRaT|I z{hF12?LQaa9sI|Fynw`d{}}aNoafKfS^OSWs>|jKf+o5hg7o0uIa&+Vl4&Hj&LfBu zVX!%e>t!c~$#r^_B}5ZZCMG6;lcpA~AlS5i_WU!$Sz_FQ9G#nvzu$YDM%21lRNJq?Ig>74 zI>v!1i4FFIlZRvhFuyH~Z0aPO^70*P!NPA;rJ6_kMrrbMaK; zh|!Ax+aK#h`_9jkDW2=KdF=#=K+Z4g3nJOs*$YpME5HzATLpdW?`|Hc-?oZ%q?{Lr znd=bU{Sg6XroL`!L7i)Kq{xwpfB2J-XOVv^@9i3xSa<8-aXu*smL(?I`U}7jy~D-UKR# z!w>Q74-zIynELI(W)iu+)FsNgV7fViYv|VZN-w1a$ZHIFp!2u4g6@cKgz4$Oy%jok zdOzy__Eu1J-K{)>m7o`SSMc5KGLyAqRNAjM-sOMzq|)$qlBE>c7tV;GV-05eq55ai zP~GzP_b;fVRZ)w?=?DD2DR~lB@3-=JG^Y}dreL|V=kJxS)HuE~pp8V6ySLAE=iPGe zntZ#}8dM^AH>irS&L&$JFEM<~!`xpawmy^ar}XQ+a;3R8V)%I0U0;EEM6_{jxa7G+ zI{IQGs4cqaFURqY=XG{=etAqRb`r)btJN28DLjcaP1t81F4JUsS+%r6mR_kA>t?VXe0{30pKxbc~;-vjgAmBQBT{$x+YQ4=dhUFHoFAl;OCkE z<3n{XJ}q2WI=Qw<=`SCnZ+5J8X@wy{dL4!o%ddF3wv!A@=`q8 z_de90mfJCSy>yv5!A7hMn{z$kuYH~!UzD%^w|!o(F3*ML4mOQL3Iqi*IKkK5P85?1 zdrk7Y-K9)^Y=Zi2p4ixlo*LL5es7szUAV(ndEU`B7=B+$oq8aXEui0aO50d&MN<;SOi+R8IP%jTpZ*$zstggNp>e^Jg z&;xPxZ@8^Kgfl*vRGM>!G^(?nz7mF(AJo&N{(QP+S=I$u!+0Dic;kQ|+Ygw=-p%4R zs=c{XBlzgcpI1-D_%4SwLnpB9+xuIeZ^Z;nLE(C+Y$yVH(>Dj1AYr$hnAgt@=jJLK zF{sr_9U4Bmc~KR)qSuG(;KCWx@RO1O}AE9!*0 zme(`fug>(JpXtk7yfF3QOwn75Uuzfmou59Gi%VK%(^aSwdS2X|&iS)a?z zy&)5}Afgi!^?Vcr13YU}dvFTZe2 zm;6}9Mys!~3paPQ!#RNcFfT&&a-dPMuPQceV{m|wsq(7X+__;cd?@V`HL993fT9)376dMB)lAKYPY*OgincIjSpD zITN?~ICZ8U6EM?V)vb{`9A0>md~Q(eM(G@3;UP0rbqBR~B@`w`94NM79XE443&u2> zx!=T0KxGNJsbuHB^tY*c*^Y~A0P9`s=n^X{>lCQ_4_6^E_c4O>;oVXTM?@yx7A!zk z_~%tf-g-XQR+v=V&ytBql>y>-r}Em<9}kwH3-Jz^x7Um6IIl}=t*Af7L@+%a^)ZE3 z)+(j?P2t@!@WMDt*QCNGAy|tAi(^bmM4OJ1>ckJ zf<=Cf@ZpT^TY~_HzM2!&;_pk+IHj@yfL+SDjl0pEgXXNOZ}B{}3my#weEWjHoa$bx zpet38wU%iTJt}8zR_%Z_%U^hDa0w~4+QU+HV(dodb~aZ$(Lyb>txHL63E9t^fb)QH>eW6ITDSEr_Y;nIl{=4pL1`W#&NC2u(0^hUY#bL#Fh^OIUkUHe7vI6 zljQj{XMTF%Sz5biRrKCI+UtJS;zQxgLs-Y@UXh%x3U3#IA5%LAig?XYlLV(O;zsEq z<@Vh%54pD!4caJLYI2Ra-yC@iOYg(4bY(^Rgs)t=GH4L8`gH&|j!Zca3%hSAV73Hl z_UQuL4e^i{C?EA(9i&fLR{Z+L35rrr)yrfX{> 
z-rm~vLDvRVF-iMrV&vM+h;uZBzu?4s89Mc7-jP-}_dj<9WbAA?S+9!VMShTC1G#My zn1pJCo7M`n=2N9jPOSK!oPJ^anf=NUa~reC8|hvHNtsvm_d{-XMpGono2aA@IP10a z8M?RK-2CyG$eX<1$$6e6wYd%OuO@xF`qL!ag6>;ZHC{udKb@PuIwG9R0bE}Tgxi=~ zSO5Ilc+Cp_&G@nQ*IX+cO5;~WzaF8axtus!_cHBM99a5wDjm(odQN|N+W+v?i-xT1 z>^DXKXR}O0L>}A#?!4RJ_2+jPa5Cc9*?TG4zaX{U?f7YdjK4RV?%MuunzTkj|F1pg zYTn0h+_KqBxAtVM|G%Fve_Lh0{JvZDi@)Q?SwWGP7i$LWSy%)d*qsHe$a_M+#2H+k zat1gk?l*z)y4kfWOL}W14t=izrm3h6YlJO+y!&sYdiR^*?rkA6>+L?I-EHK_1~s$) z{)_s3u&Ee6_tieT`7WNX zJ^K66ZM`Bz>{rreurgiTzhr9@qp3*?@DfGoRDnzK+wTh9FYC{=%>|yITmAjovD&vc zH>=lKP2RFTZmCZB65Hj=_68*Q>wdfc?zYpwl?+m>-&gE7=*cB?N@PyT z?W~#G=I2f5yKR>l+nA7dv3U~kK!TaT7XDy#WbvU^kH z4$0sv?&Wt@8Ry^I+!`+~oxPB- zeg|lA<@dYg={$Zxdw+cQthqh!bgb5o6Lsse_y4Zl4IFkj0yF{GnF4izUumoEXZm;r@l61?fSX@c76Vr zdhp>~>20%g->)^^9~#zv_3=T^+5b1Li(4=yfA`l~WHoM_PVqSfn}~cvfSA)KcJr9QNSVnjWi{il%N0dZx~* z0X+Qj9<9gBvSb5? zi&C|N8k%`SvNtTA$G%xM{ts~V^|9NVdmR*6O^&XJdwgof5}nLey;CKHFBZ15F$U!Y zq$FSe3JKNT-sZ=?$`7jl1IKhOYlz(s?E?2xf4uWJ{chv3YkJz+#=j0Y2aDVXbw8IG zJdv8oYIP+gSL@|P)gQNl|IWU-wQ~2(y_wa|*#6r+y1nkAa4_VeJGA~xqY~(@XO29f zSQmxx#w83(W6pb46Ss^&q1WNK$gpoSjE3zDl2i7){3E>% zc*wEOJe!-ZS%6)zuUYPke=+60v;!Uvc^ov34cxuH-HxRi>!23Ks **The scope of "prompt" here**: In agent applications, "prompt" refers not only to the narrow system prompt, but also to all natural language assets that drive agent behavior—skill descriptions, rule specifications, sub-agent coordination instructions, tool usage instructions, etc. Their essence is natural language text interpreted by LLMs; as long as they influence agent decisions, they can be optimization targets for `AgentOptimizer`. 
+ +The module consists of four sub-modules, driven externally through a single entry point `AgentOptimizer.optimize`: + +| Sub-module | Responsibility | +|---|---| +| **Optimization Algorithm** | Reflection-evaluation-retention loop; currently built-in [GEPA](https://github.com/gepa-ai/gepa) (Genetic-Pareto, MIT License), extensible to other algorithms via `OPTIMIZER_REGISTRY` | +| **Evaluation Bridge** | Reuses `AgentEvaluator`, allowing the optimization process to share the same `EvalSet` and metric configuration with daily regression | +| **Prompt Management** | `TargetPrompt` unifies prompt field read/write; supports two sources: local files (path) and arbitrary backends (callback) | +| **Runtime Orchestration** | Resource scheduling, stoppers, atomic artifact persistence, SIGINT signal safety | + +`AgentOptimizer` redefines "prompt tuning" as an engineering problem that is **bounded, reproducible, and auditable**: + +| Dimension | Expression | +|---|---| +| Optimization Objective | `evaluate.metrics[]` — a set of numerical, repeatable evaluation metrics | +| Decision Variables | Prompt fields registered with `TargetPrompt` (one or more) | +| Search Process | Reflection-evaluation-retention loop driven by reflection LM (see [§5](#5-how-gepa-works) for details) | +| Termination Conditions | 6 built-in stoppers + user-defined stoppers (see [§4.7](#47) for details) | +| Artifacts | `OptimizeResult` object + `runs/<timestamp>/` full audit directory (see [§8](#8-artifacts-and-directory-conventions) for details) | + +> **Prerequisite Reading**: [Agent Evaluation](evaluation.md) — Optimization is built on top of evaluation; this document assumes the reader understands the basic concepts of `EvalSet` and `metric`. + +--- + +## 1 What Is This / What Problem Does It Solve + +### 1.1 Problems Solved + +After agent applications enter business-critical paths, prompts (including all natural language text that drives agent behavior such as skills, rules, etc.) 
are among the most expensive assets to iterate: manual tuning relies on engineers' ability to summarize failure cases, and regression risks amplify rapidly after scaling; coupling between prompt fields on multi-sub-agent chains makes single-field optimization meaningless; model upgrades, tool changes, and scenario expansion all cause "yesterday's optimal" prompts to fail today. + +The `AgentOptimizer` module completely **engineers this iterative process**: + +- **Explicit optimization objectives** — crystallizes "what counts as good" into a numerical contract of metric + threshold, shareable across evaluation, optimization, and CI/CD +- **Algorithmic search process** — reflection-evaluation-retention loop replaces manual trial and error; process is replayable, results are comparable +- **Multi-prompt joint optimization** — supports simultaneous optimization of multiple fields (e.g., router + worker + summarizer instructions, CLAUDE.md + SKILL.md), and uses GEPA's merge mechanism for cross-field search +- **Auditable runtime process** — each round's reflection input, candidate changes, evaluation scores, acceptance/rejection reasons are all persisted to `runs/<timestamp>/`, supporting post-hoc traceability +- **Controllable and rollbackable results** — `update_source` determines whether to write back to source prompts; `TargetPrompt` provides atomic writes and failure rollback; half-written disk writes or secondary SIGINT interrupts will not corrupt source files + +### 1.2 Relationship with the Evaluation Module + +`AgentEvaluator` and `AgentOptimizer` constitute the two ends of the **evaluation-optimization closed loop**: + +| Module | Role | Output | +|---|---|---| +| `AgentEvaluator` ([evaluation.md](evaluation.md)) | Measures current prompt quality | Pass/fail per case + each metric score | +| `AgentOptimizer` (this document) | Searches for better prompts based on measurement results | Optimal prompt + full optimization history | + +The two share the same `EvalSet`, the 
same metric configuration, and the same `call_agent`. One set of assets supports both daily regression (pytest running `AgentEvaluator`) and periodic optimization (nightly window running `AgentOptimizer`, see [§4.6 CI Closed Loop](#46)). + +### 1.3 Applicable Boundaries + +The effectiveness of `AgentOptimizer` depends on three prerequisites: + +1. **Evaluation signals are sufficiently stable**. When the variance of the scoring itself is greater than the improvement brought by prompt rewriting, the optimization direction is unreliable. It is recommended to first run `AgentEvaluator` with `num_runs=3` to observe metric cross-run consistency before starting optimization. +2. **Budget matches the search space**. A typical small-scale optimization is on the order of `max_metric_calls=30~60` (one case-level evaluation counts as one metric_call), 5~20 reflection LM calls, running 1~10 minutes, consuming tens to hundreds of dollars (see [§6 Cost and Concurrency](#6-cost-and-concurrency) for details). When the budget is significantly lower than this level, you should first complete baseline tuning on `AgentEvaluator`. +3. **Prompt has optimizable semantic structure**. Hardcoded prompts of fewer than 20 characters, or prompts used only for placeholder concatenation, have too narrow a search space; GEPA reflection degenerates into synonym rewriting in this scenario. + +For scenarios that do not meet the above prerequisites, you should prioritize using [`AgentEvaluator`](evaluation.md) for continuous observation rather than starting optimization. + +## 2 5-Minute Quickstart + +Complete code and data: [`examples/optimization/quickstart/`](../../../examples/optimization/quickstart/). + +### 2.1 Example Task + +The agent in this example is an **elementary school arithmetic word problem solver**: it receives arithmetic problems described in natural language (e.g., "Xiao Ming bought 4 apples in the morning and 7 more apples in the afternoon. 
How many apples does he have in total?"), and outputs a numerical answer with units (e.g., "Answer: 11 apples"). + +The agent behavior is driven by two prompt files together, which are the optimization targets for this session: + +| Optimization Target | Path | Role in Agent | +|---|---|---| +| **system_prompt** | `agent/prompts/system.md` | Role and response style definition (e.g., "You are a math teaching assistant, answer in clear Chinese") | +| **skill** | `agent/prompts/skill.md` | Problem-solving methodology (e.g., "First identify the problem type → set up equation → calculate → write answer with units") | + +Evaluation scores along two dimensions simultaneously; both must pass for the agent to pass: + +| Evaluation Metric | Type | Threshold | Scoring Method | +|---|---|---|---| +| `final_response_avg_score` | Text matching | 1.0 | Agent output must **contain** the reference text (e.g., "Answer: 11 apples"), case-insensitive | +| `llm_rubric_response` | LLM judge | 0.66 | Independent LLM scores according to three rubrics and takes the mean: ① answer value matches reference ② reasoning steps are clear ③ answer has correct units | + +Dataset size: training set 5 cases, validation set 3 cases. + +### 2.2 Prepare Environment + +```bash +pip install "trpc-agent-py[optimize]" + +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +The `[optimize]` extra includes `gepa` (reflection algorithm implementation) and `rich` (terminal progress panel). 
+ +### 2.3 Directory Structure + +```text +examples/optimization/quickstart/ +├── agent/ +│ ├── agent.py # Defines create_agent() factory function +│ ├── config.py # Model / credentials read from environment variables +│ └── prompts/ +│ ├── system.md # Baseline system prompt (to be optimized) +│ └── skill.md # Baseline skill document (to be optimized) +├── train.evalset.json # 5 training cases (source of reflection minibatch) +├── val.evalset.json # 3 validation cases (full evaluation each round, decides whether candidate is accepted) +├── optimizer.json # Algorithm + metric configuration +└── run_optimization.py # Entry script +``` + +> Training and validation sets must be different files; the framework validates at startup that paths do not overlap. + +### 2.4 Core Code + +`run_optimization.py` consists of three segments, corresponding to the three core abstractions exposed by the optimizer. + +**Segment 1: `call_agent` — Business Bridge Function** (see [§3.2](#32-call_agent) for details) + +The signature is fixed as `async def(query: str) -> str`. The framework drives the agent to complete single inference through it; agents of any form (`LlmAgent`, HTTP service, subprocess CLI, etc.) are all accessed through this layer of bridging. + +```python +async def call_agent(query: str) -> str: + # Re-read prompt files each time → GEPA writes new candidates and they take effect immediately + root_agent = create_agent() + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=root_agent, + session_service=session_service) + # ... send user_content, collect is_final_response events + return final_text.strip() +``` + +**Segment 2: `TargetPrompt` — Optimization Target Declaration** (see [§3.3](#33-targetprompt) for details) + +Registers which prompt fields will be read/written by the optimizer. 
Each field corresponds to a local file (`add_path`) or a pair of async read/write callbacks (`add_callback`, used for arbitrary backends like remote KV). + +```python +target = ( + TargetPrompt() + .add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + .add_path("skill", str(SKILL_PATH)) +) +``` + +**Segment 3: `AgentOptimizer.optimize` — Optimizer Invocation** (full parameters see [§7.1](#71-agentoptimizeroptimize-parameter-table)) + +```python +await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(RUNS_DIR / timestamp), + update_source=False, + verbose=1, +) +``` + +| Parameter | Description | +|---|---| +| `config_path` | `optimizer.json`, defines metric / algorithm / stop conditions | +| `output_dir` | Artifact directory; created automatically if it doesn't exist, recommended to use timestamp subdirectory | +| `update_source` | `False` only produces `best_prompts/`; `True` writes back to source files after successful optimization (CI scenario, see [§4.6](#46)) | +| `verbose` | `0` silent / `1` Rich progress panel / `2` plus gepa diagnostic logs | + +### 2.5 Configuration File `optimizer.json` + +The configuration is divided into two sections: `evaluate` (evaluation, same source as the evaluation module) + `optimize` (optimizer-specific). 
+ +```json +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": {"text": {"match": "contains", "case_insensitive": true}} + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "...", "base_url": "...", "api_key": "..."}, + "rubrics": [ + {"id": "numeric_correct", "content": {"text": "Answer value matches reference"}, "type": "FINAL_RESPONSE_QUALITY"}, + {"id": "reasoning_clear", "content": {"text": "Reasoning steps are clear"}, "type": "FINAL_RESPONSE_QUALITY"}, + {"id": "units_present", "content": {"text": "Answer has correct units"}, "type": "FINAL_RESPONSE_QUALITY"} + ] + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": {"required_metrics": "all"}, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": {"model_name": "...", "base_url": "...", "api_key": "..."}, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 60, + "max_iterations_without_improvement": 8 + } + } +} +``` + +Key concepts used in this example: + +| Concept | Location in Config | One-Line Explanation | See Also | +|---|---|---|---| +| **metric** | `evaluate.metrics[]` | List of evaluation metrics; multiple can be stacked, each scored independently | [§4.5](#45) | +| **LLM judge** | `criterion.llm_judge` | LLM judge that scores according to rubrics; serves `llm_rubric_response` in this example | [§4.5](#45) | +| **stop.required_metrics** | `optimize.stop.required_metrics` | Framework-level stop: which metrics must all reach threshold before stopping | [§7.3.5](#735-optimizestop-section) | +| **reflection_lm** | `optimize.algorithm.reflection_lm` | Reflection LLM that reviews failed cases each round and generates new candidate prompts | 
[§3.8](#38-reflection-lm) / [§6.5](#65-reflection-lm-selection-suggestions-table) | +| **candidate_selection_strategy** | `optimize.algorithm` | Which candidate to pick as reflection parent each round | [§7.3.3](#733-optimizealgorithm-section) | +| **module_selector** | `optimize.algorithm` | Which field to rewrite each round in multi-field optimization | [§4.3](#43) | +| **reflection_minibatch_size** | `optimize.algorithm` | How many cases to sample from train each round for reflection | [§5](#5-how-gepa-works) | +| **stopper** | `optimize.algorithm.max_*` / `timeout_seconds` / `score_threshold` | Algorithm-level stop conditions, at least one must be set | [§4.7](#47) / [§7.3.3](#733-optimizealgorithm-section) | + +See [§7.3](#73-optimizerjson-configuration-items-table) for the complete field reference. + +### 2.6 Run + +```bash +python examples/optimization/quickstart/run_optimization.py +``` + +The terminal outputs in order: baseline evaluation scores → acceptance/rejection records for each round's reflection → final summary. Completes in 1~3 minutes under small-scale configuration. 
+ +![Quickstart Terminal Output Example](../assets/imgs/optimization_quickstart.png) + +```text +runs// +├── result.json # Complete run record (OptimizeResult serialized) +├── summary.txt # Human-readable overview (read this first) +├── run.log # Single-line status +├── config.snapshot.json # Snapshot copy of input configuration +├── rounds/round_NNN.json # Each round's RoundRecord +├── baseline_prompts/.md # Pre-optimization snapshot +└── best_prompts/.md # Best candidate after optimization (only if SUCCEEDED) +``` + +Key lines in `summary.txt`: + +```text +Optimization complete | status=SUCCEEDED | algorithm=gepa_reflective +pass_rate : 0.5000 -> 0.8500 (+0.3500, improved) +rounds : 3 accepted / 7 total +duration : 124.31s +stop_reason : required_metrics_passing +update_source : false +``` + +> **What is pass_rate?** +> +> pass_rate measures: **what proportion of cases your agent "got right" on the validation set**. +> +> --- +> +> **Step 1: Each metric independently determines pass/fail** +> +> Each metric has its own threshold. Score ≥ threshold means pass; otherwise fail. +> +> **Step 2: A case passes only when ALL metrics pass** +> +> Think of it like an exam with multiple subjects — you must pass every subject to pass overall. Failing any single subject means the whole case fails. +> +> **Step 3: pass_rate = number of passing cases ÷ total cases** +> +> --- +> +> **Walkthrough example**: Suppose the validation set has 4 cases, with 3 metrics configured: +> +> | | metric_A (threshold 0.8) | metric_B (threshold 0.6) | metric_C (threshold 1.0) | Does this case pass? 
| +> | --- | --- | --- | --- | --- | +> | case_1 | score 0.9 ✅ | score 0.7 ✅ | score 1.0 ✅ | **Pass** (all 3 met) | +> | case_2 | score 0.85 ✅ | score 0.4 ❌ | score 1.0 ✅ | **Fail** (metric_B not met) | +> | case_3 | score 0.6 ❌ | score 0.8 ✅ | score 0.0 ❌ | **Fail** (metric_A & C not met) | +> | case_4 | score 0.95 ✅ | score 0.9 ✅ | score 1.0 ✅ | **Pass** (all 3 met) | +> +> 2 passed out of 4 total: +> +> ``` +> pass_rate = 2 / 4 = 0.5 +> ``` +> +> --- +> +> **Back to the summary.txt above**: +> +> ``` +> pass_rate : 0.5000 -> 0.8500 (+0.3500, improved) +> ``` +> +> This means: before optimization the agent could only get half the cases right; after optimization it gets 85% right. An improvement of 35 percentage points. +> +> **Three related fields**: +> +> | Field | Meaning | +> | --- | --- | +> | `baseline_pass_rate` | Pass rate before optimization (scored with the initial prompt) | +> | `best_pass_rate` | Highest pass rate found during optimization | +> | `pass_rate_improvement` | `best - baseline`, the improvement gained from this optimization run | + +See [§8 Artifacts and Directory Conventions](#8-artifacts-and-directory-conventions) for the complete meaning of each field. + +### 2.7 Next Steps + +| Your Next Question | Jump to Section | +|---|---| +| What exactly are these API concepts? | [§3 Core Concepts](#3-core-concepts) | +| My agent isn't this kind of local LlmAgent, how do I integrate? | [§4 Your Scenario → How to Integrate](#4-your-scenario--how-to-integrate) | +| What exactly does each step of the reflection-evaluation-retention loop do? | [§5 How GEPA Works](#5-how-gepa-works) | +| Want to estimate LLM call costs / adjust concurrency parameters? | [§6 Cost and Concurrency](#6-cost-and-concurrency) | +| Want to directly look up parameters / configuration items? 
| [§7 Complete API Reference](#7-complete-api-reference) | + +## 3 Core Concepts + +> This section uses 8 concepts to establish a "mental model" of the optimization module. Each concept starts from "what does it correspond to in your work" rather than from type signatures. The introduction order is consistent with the appearance order of the three code segments in [§2.4 Core Code](#24-core-code). + +### 3.1 Module Overall Data Flow + +The optimization module's work loop: the user inputs 4 types of assets, and the module produces 2 types of results in the reflection-evaluation-retention loop. + +```text + +---> Evaluate candidate + | | + call_agent ---+ | v + | | Reflect on failures + optimizer.json ---+ | | + | | v ---> OptimizeResult + +------>| Write new candidate + runs// + TargetPrompt ---+ | | + | | v + EvalSet x 2 ---+ | Accept new best? + | Y:keep / N:drop + | | + +---------+ +``` + +Roles of the four inputs: + +| Input | Form | Role in the Loop | +| --- | --- | --- | +| `call_agent` | `async (str) -> str` | Passes query to business agent; optimizer samples behavior through this | +| `optimizer.json` | JSON configuration | Defines evaluation metrics (`evaluate.metrics`) and algorithm parameters (`optimize.algorithm`) | +| `TargetPrompt` | Multi-field prompt registration table | Declares which prompt files / remote configuration entries are optimization targets | +| `EvalSet Ɨ 2` | Two evalsets | Training set for reflection LM to see failure cases, validation set for scoring / early stop determination | + +Destinations of the two outputs: + +| Output | Form | Typical Use | +| --- | --- | --- | +| `OptimizeResult` | In-memory object returned by `optimize()` | Programmatic reading (baseline / best / each round details) | +| `runs//` | Audit directory | Manual review, CI parsing, re-run (see [§8](#8-artifacts-and-directory-conventions) for details) | + +### 3.2 call_agent + +**One sentence**: The "universal plug" for your business agent. 
+ +**Why needed**: Your agent might be a local `LlmAgent`, might be a deployed HTTP service, might be a black-box CLI like `claude` / `codex`. The module cannot write adapters for every form; you only need to wrap "given a query → get the agent's final response" into an async function, and the module drives the agent to run evaluations through it. + +**How to use**: + +```python +async def call_agent(query: str) -> str: + # Your implementation: call local agent / HTTP service / subprocess CLI, all fine + # Key point: re-read prompt files each time (so GEPA's new candidates take effect immediately) + root_agent = create_agent() + runner = Runner(...) + return await run_and_collect_final_response(runner, query) +``` + +The signature is fixed as `async (str) -> str`; it can neither take additional parameters nor be synchronous. + +**When the framework calls it**: + +| Timing | Frequency | +|---|---| +| Baseline evaluation | Each val case × `num_runs` | +| Each round's minibatch evaluation | Each sampled case 1 time | +| Each round's candidate validation set evaluation | Each val case × `num_runs` | + +### 3.3 TargetPrompt + +**One sentence**: Tells the module "which prompt files are to be optimized", equivalent to an **optimization target registration table**. + +**Why needed**: In agent projects, prompts are usually scattered across multiple files or even multiple backends (system.md / skill.md / also placed in QCS versions); the module needs to know: **when a new candidate is reflected, where should it be written, and where should it read from when reading baseline**. `TargetPrompt` is this "address book". 
+ +**How to use**: + +```python +from trpc_agent_sdk.evaluation import TargetPrompt + +target = ( + TargetPrompt() + .add_path("system_prompt", "agent/prompts/system.md") # File type + .add_path("skill", "agent/prompts/skill.md") # File type + .add_callback("rule", # Callback type (remote KV) + read=load_rule_from_kv, + write=save_rule_to_kv) +) +``` + +Each field `name` (e.g., `"system_prompt"`) will become, after optimization ends: + +- `result.best_prompts["system_prompt"]` — programmatic reading of optimal prompt +- `runs//best_prompts/system_prompt.md` — human reading of optimal prompt +- Elements in `RoundRecord.optimized_field_names` — see which field was changed each round + +**Two types of sources**: + +| Source | Applicable When | What the Framework Does | +|---|---|---| +| `add_path(name, path)` | Prompt is in local file | Write to disk using tmp + `os.replace` atomic write; multi-field failure rolls back source files | +| `add_callback(name, *, read, write)` | Prompt is in remote configuration center / database / git, etc., any backend | Calls your `read` / `write` async functions; atomicity is guaranteed by you | + +See [§7.2](#72-targetprompt-api-table) for the complete API. + +### 3.4 AgentOptimizer + +**One sentence**: The module's "power button". + +**Why needed**: You wouldn't want to manually write the whole process of "read config → validate inputs → run reflection loop → persist to disk → assemble result"; `AgentOptimizer` encapsulates this process into one call—you give it **inputs**, it returns **results**. 
+ +**How to use**: + +```python +from trpc_agent_sdk.evaluation import AgentOptimizer + +result = await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs/2026-05-19T17-00-00", +) +print(result.best_pass_rate) +``` + +This module has only this one public entry point, **no other way to start optimization**. + +**What it does**: + +1. Loads and validates `optimizer.json` (throws error before running if schema is wrong) +2. Validates `call_agent` is async function / `target_prompt` has at least one registered field / training set ≠ validation set +3. Runs reflection-evaluation-retention loop +4. Persists artifacts to `output_dir/` +5. Returns an `OptimizeResult` object + +`optimize` has 11 keyword-only parameters in total; the 6 commonly used ones are in [§2.4](#24-core-code), all parameters see [§7.1](#71-agentoptimizeroptimize-parameter-table). 
+ +**`update_source` decision table** (key parameter shared by all §4.x scenarios): Determines whether to **write back** the optimal candidate to the source prompt files registered in `TargetPrompt` after successful optimization— + +| `update_source` | What to do after success | Effective Path | Applicable Scenario | +|---|---|---|---| +| `False` (default) | Only write the optimal candidate to `output_dir/best_prompts/` | You **manually** review → copy to online prompt file → takes effect on next call | Grayscale deployment, requires manual review, don't want optimizer to directly modify online files | +| `True` | Directly **overwrite** source prompt files with the optimal candidate | Business next call **immediately** uses the new prompt | Automated closed loop (e.g., night optimization task, see [§4.6 CI Closed Loop](#46)) | + +Regardless of which you choose, the business side requires **zero restart, zero code changes**—the way to perceive prompt changes is always "re-read file on next call". + +> Safety guarantee of `update_source=True`: Overwrite uses tmp + `os.replace` atomic write; if optimization is interrupted midway or by SIGINT, the source prompt file **will not be half-written**, preserving original content (see [§8.3 Atomic Disk Persistence](#83-atomic-disk-persistence-guarantee) for details). + +### 3.5 optimizer.json + +**One sentence**: A configuration file that tells the module "what counts as good" and "how to search". + +**Why needed**: Metric thresholds, minibatch size, reflection LM configuration, stop conditions... if these parameters are scattered in code, you need to modify code every time you run an experiment. After centralizing to one JSON file, tuning parameters = modify JSON, and reproducibility is also better (a copy of `config.snapshot.json` will be saved in the artifacts). + +**What it looks like**: [§2.5](#25-configuration-file-optimizerjson) already showed the complete example. 
Structurally divided into two sections: + +```text +{ + "evaluate": { ... }, # Same schema as AgentEvaluator: metric list + num_runs + "optimize": { + "eval_case_parallelism": 2, + "stop": { # Framework-level stop: which metrics must reach threshold + "required_metrics": "all" + }, + "algorithm": { # Algorithm-specific: reflection_lm / minibatch / 6 types of stoppers + "name": "gepa_reflective", + ... + } + } +} +``` + +**Division of labor between the two sections**: + +- `evaluate` section: **completely reuses** the evaluation module's schema. Metric configurations you wrote for evaluation projects can be directly copied over +- `optimize` section: **optimizer-specific**. Among them, `algorithm.name` is the algorithm selector; currently the only optional value is `"gepa_reflective"`, will be extended by [§9.2 Registering New Algorithms](#92) when new algorithms are added in the future + +See [§7.3](#73-optimizerjson-configuration-items-table) for the complete field table. + +### 3.6 EvalSet / EvalCase + +**One sentence**: Training set + validation set, format identical to the evaluation module. + +**Why need two separate files**: + +- **Training set**: The module randomly **samples** a few cases from it each round (`reflection_minibatch_size`, default lets gepa decide) for the reflection LM to see failure cases → used to "find improvement directions" +- **Validation set**: After each new candidate is generated, **run fully** on it for scoring → used to "verify whether the candidate is actually better" + +**Why must they be different files**: The training set determines what the reflection LM sees, the validation set determines whether a candidate is accepted. If the two overlap, it becomes "using exam questions for practice, then using exam questions for grading"—the resulting best_pass_rate is not credible. 
The framework validates at startup by comparing paths (`os.path.normpath(os.path.abspath(...))`) to defend against this, and directly throws `ValueError` if they overlap. + +See [Evaluation Set Writing Guide](evaluation.md#evaluation-set-evalset-writing-guide) for format and writing guidelines. + +### 3.7 OptimizeResult + +**One sentence**: The "complete output" after one optimization run, both the return value of `optimize()` and the content of `runs//result.json`. + +**Why needed**: After running optimization, you care most about three things—success or not / how much improvement / what is the optimal prompt. `OptimizeResult` packages them: + +```python +result = await AgentOptimizer.optimize(...) + +# 1. Success or not +if result.status == "SUCCEEDED": + ... + +# 2. How much improvement +print(f"{result.baseline_pass_rate:.2%} → {result.best_pass_rate:.2%}, " + f"+{result.pass_rate_improvement:.2%}") + +# 3. What is the optimal prompt +new_system_prompt = result.best_prompts["system_prompt"] +new_skill = result.best_prompts["skill"] +``` + +It also carries process data (what happened each round, reflection LM call count, total duration, etc.) for post-hoc analysis. + +**The 6 most frequently viewed fields**: + +| Field | Type | Meaning | +|---|---|---| +| `status` | `"SUCCEEDED"` / `"FAILED"` / `"CANCELED"` | Final state | +| `baseline_pass_rate` / `best_pass_rate` | `float` | Pass rate before / after optimization | +| `pass_rate_improvement` | `float` | Difference between the two | +| `best_prompts` | `dict[str, str]` | Field name → optimal prompt text | +| `rounds` | `list[RoundRecord]` | Each round's record | +| `stop_reason` | `Literal[...]` or `None` | Which stopper triggered the stop | + +See [§7.4](#74-optimizeresult--roundrecord-field-table) for all 22 fields (including `RoundRecord`). 
+ +### 3.8 Reflection LM + +**One sentence**: The LLM used internally by the module, which receives a set of failure cases each round and outputs improved prompt candidates; it is a separate configuration from the business LM used by your agent. + +Configured in the `optimizer.json::optimize.algorithm.reflection_lm` section, type is `OptimizeModelOptions`: + +```json +"reflection_lm": { + "model_name": "gpt-4o", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-...", + "generation_config": {"temperature": 0.6, "max_tokens": 4096} +} +``` + +See [§6.5](#65-reflection-lm-selection-suggestions-table) for model selection suggestions; see [§7.3.3](#733-optimizealgorithm-section) for complete fields. + +## 4 Your Scenario → How to Integrate + +| Your Situation | Section | Corresponding Example | +|---|---|---| +| Agent is an online HTTP service (FastAPI / Gin / self-developed interface) | [§4.1](#41) | `http_service` | +| Agent is a subprocess / command-line tool (`claude` / `codex` / internal CLI) | [§4.2](#42) | `blackbox_cli` | +| Agent is a multi-sub-agent chain (multiple sub-agents collaborate to complete one response), want to optimize each sub-agent's prompt simultaneously | [§4.3](#43) | `multi_agent_pipeline` | +| Prompts are not in local files, stored in remote KV / configuration center / database / Git, etc., any backend | [§4.4](#44) | `remote_prompt_store` | +| Single evaluation metric is insufficient, need to run multiple evaluation metrics simultaneously (e.g., answer accuracy + hallucination rate + style compliance rate) and fuse into a total score | [§4.5](#45) | `multi_metric_with_judges` | +| Want to integrate CI closed loop: run evaluation gate on PR, run optimization in night window and automatically write back new prompts | [§4.6](#46) | `ci_integration` | +| Optimization task has hard constraints (e.g., must complete within 1-hour window / cumulative calls not exceeding N / stop after consecutive no-improvement) | [§4.7](#47) | 
`slo_runtime_control` | +| Can already run through the basic process, want to further improve results (adjust GEPA candidate selection / Pareto frontier / cross-field fusion) | [§4.8](#48) | `advanced_strategies` | +| Other common extensions (connect Grafana / WandB, etc. for monitoring, custom stop strategy, use your own optimization algorithm) | [§4.9](#49) | (Multiple examples combined) | + +### 4.1 My Agent is an HTTP Service, How to Integrate? {#41} + +**Your situation**: The business agent is already online as an independent service (FastAPI / Gin / self-developed framework are all acceptable), hoping to perform automatic optimization on its prompts—but the service runs long-term and cannot stop, service implementation details are a black box to the optimizer, and prompts are usually injected in file form. + +**Integration model**: The optimizer accesses as a **pure client**, with only **one coupling point** with the service process—the prompt files on disk. + +```text ++-------------------+ HTTP request + query +-------------------+ +| AgentOptimizer | --------------------------------> | HTTP agent | +| (optimizer) | <--------- text response -------- | (no code change) | ++---------+---------+ +---------+---------+ + | ^ + | write new prompt candidate | Each request + v | re-reads prompt + +------------------------------------------------------------+ + | prompt files (on disk) | + +------------------------------------------------------------+ +``` + +The service process **does not need any code changes**, only needs to satisfy one convention: **re-read prompt files before processing each request**—so that the new candidate written by the optimizer takes effect on the next request. 
+ +**Integration in 3 steps**: + +**Step 1: Register `TargetPrompt` on the prompt files read by the HTTP service** + +```python +target = TargetPrompt().add_path("system_prompt", "service/prompts/system.md") +``` + +The second parameter of `add_path` must be **the exact file path that the service process actually reads** (not an arbitrary copy), otherwise the new candidate written by the optimizer will not be perceived by the service. + +**Step 2: Write `call_agent` as an HTTP client to the service** + +```python +async def call_agent(query: str) -> str: + async with httpx.AsyncClient(timeout=120.0) as client: + resp = await client.post("http://my-agent-service/chat", + json={"query": query}) + resp.raise_for_status() + return resp.json()["final_text"] +``` + +Modify the `json=...` field according to the actual interface payload schema of the business; adjust `timeout` according to the business's first inference latency (example default 120s). + +**Step 3: Call `AgentOptimizer.optimize`** + +```python +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir=f"runs/{timestamp}", + update_source=False, # Decision table see [§3.4](#34-agentoptimizer) +) +``` + +**Pre-integration checklist**: + +| Check Item | Description | +|---|---| +| Does the service re-read prompt files on each request? | No → New candidates written by optimizer won't be seen by the service, optimization is ineffective. Need to add re-read logic in the handler | +| Does the optimizer process have write permission to prompt files? | No → Optimizer cannot persist new candidates | +| Are the prompt file paths seen by the service and the optimizer consistent? | Especially need to confirm in containerized deployment (mount path / symlink) | +| What is the service's 5xx behavior? 
| The service should not silently retry internally—this would mask the real failure rate, letting the optimizer see a false "high score" | + +**→ Complete example**: [`examples/optimization/http_service/`](../../../examples/optimization/http_service/) +- `service/server.py` — Demonstrates FastAPI service with prompt hot-loading (`/chat` rebuilds agent and re-reads `system.md` each time), can be used as a reference for business service transformation +- `run_optimization.py` — Client optimizer entry, includes pre-start service health check (fail-fast) + +### 4.2 My Agent is an External Command-Line Tool (CLI), Optimizer Cannot Get Its Code {#42} + +**Your situation**: The business agent is an external executable program—`claude` / `codex` / self-developed CLI, etc. Its source code, internally used LLM client, and runtime language are **completely black boxes** to the optimizer, but it reads several prompt files from a working directory at startup (typically `CLAUDE.md` + `.claude/skills//SKILL.md`). You hope to optimize these prompt files without modifying the CLI code or binding to any of its internal dependencies. + +**Integration model**: The optimizer calls the CLI through **subprocess**, and the **only coupling point** with the CLI is still the prompt files on disk—this is the same structure as §4.1's HTTP service, the difference is only replacing "HTTP request" with "starting a subprocess". 
+ +```text ++-------------------+ start subprocess + pass query +-------------------+ +| AgentOptimizer | --------------------------------> | External CLI | +| (optimizer) | <--------- stdout text ---------- | (no code change) | ++---------+---------+ +---------+---------+ + | ^ + | write new prompt candidate | Each startup + v | auto-reads + +------------------------------------------------------------+ + | prompt files (on disk) | + +------------------------------------------------------------+ +``` + +The CLI binary itself **does not need any modifications**, only needs to satisfy: **it loads prompt files from the specified directory on each startup** (most CLI tools are designed this way). + +**Integration in 3 steps**: + +**Step 1: Register `TargetPrompt` on the prompt files read by the CLI (use `add_path` multiple times for multiple files)** + +```python +target = ( + TargetPrompt() + .add_path("claude_md", "workspace/CLAUDE.md") + .add_path("skill_md", "workspace/.claude/skills/city-info/SKILL.md") +) +``` + +Each `add_path` registers one independent field; GEPA treats each field as an independently optimizable module, which can be optimized separately or jointly (see §3.3, §4.3 for details). 
+ +**Step 2: Wrap subprocess call + stdout normalization into `call_agent`** + +```python +async def call_agent(query: str) -> str: + proc = await asyncio.create_subprocess_exec( + "trpc-claudecode", "--print", + "--add-dir", str(WORKSPACE_DIR), # CLI loads prompt files from here + "--dangerously-skip-permissions", + query, # Pass query as argv, avoid shell escaping + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=_build_cli_env(), # Environment variables expected by business's own CLI + ) + stdout_b, stderr_b = await asyncio.wait_for( + proc.communicate(), timeout=90.0, # Prevent single CLI from hanging + ) + if proc.returncode != 0: + raise RuntimeError(f"CLI exited {proc.returncode}: {stderr_b[:400]!r}") + return _normalize_response(stdout_b.decode("utf-8", "replace")) +``` + +`call_agent` still has the standard signature `async (query: str) -> str` from §3.1; to the optimizer main loop, this `call_agent` is no different from "calling local LLM". `_build_cli_env` / `_normalize_response` are helper functions implemented by the business according to their CLI's characteristics (the former modifies/supplements environment variables to the form expected by the CLI, the latter normalizes CLI stdout into a stable string comparable for evaluation)—this framework does not prescribe their form, implement as needed. + +**Step 3: Run once to confirm baseline works, then hand over to GEPA reflection optimization** + +```python +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs//", + update_source=False, +) +``` + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Does the CLI re-read prompt files on each startup? 
| No → New candidates written by optimizer won't take effect; evaluation between candidates is equivalent to running the same baseline | +| Does the CLI support passing query through argv / stdin / `--query xxx`? | No → Integration is not feasible (need to add this entry point to CLI first) | +| Is the CLI's average single-run latency known? | No → Cannot reasonably set `CLI_TIMEOUT_SEC` and `max_metric_calls` | +| Does the CLI process pollute shared disk state (other than prompt files)? | Yes → Evaluation is not reproducible; need `eval_case_parallelism=1` or independent workspace for each case | + +**→ Complete example**: [`examples/optimization/blackbox_cli/`](../../../examples/optimization/blackbox_cli/) +- `agent/call_agent.py` — Subprocess call + environment variable adaptation + stdout normalization engineering implementation, can be used as a starting point for integrating your own CLI +- `run_optimization.py` — Standard entry for dual-field (`CLAUDE.md` + `SKILL.md`) `TargetPrompt` + +### 4.3 My Agent is a Multi-Sub-Agent Chain, Want to Optimize Each Sub-Agent's Prompt Simultaneously {#43} + +**Your situation**: The business side has already orchestrated a multi-sub-agent collaboration chain. Each sub-agent has its own system prompt, and there are implicit contracts between fields (the output form of upstream sub-agent must match downstream expectations). Common symptoms during manual iteration are **"fixing A shows effect, but drags down B"**. You hope to **jointly optimize** prompts for all sub-agents, so that end-to-end metrics improve. + +**Integration model**: Register each sub-agent's prompt file as an **independent field** of `TargetPrompt`—GEPA treats each field as an independently optimizable module (component), selects 1 or more fields to write back each round according to `module_selector`, and the optimizer only looks at the end-to-end metric score as feedback. 
The chain code requires **zero modifications**; each sub-agent just needs to re-read its own prompt file each time it is called. + +```text ++-----------------------------+ select 1 field each round +---------------------+ +| AgentOptimizer | --------------------------> | prompt files | +| (multi-field TargetPrompt) | write back new candidate | (each sub-agent | +| | | has 1 file) | ++--------------+--------------+ +----------+----------+ + ^ | + | End-to-end metric score | Each call + | | re-reads prompt + | v + | +-----------------------------------------+ + +------------- | call_agent(query) | + | = Your multi-sub-agent chain | + | call entry | + | (sub-agent A → sub-agent B → ...) | + +-----------------------------------------+ +``` + +**Integration in 3 steps**: + +**Step 1: Register each sub-agent's prompt file as an independent field** + +```python +target = ( + TargetPrompt() + .add_path("agent_a", ".md") + .add_path("agent_b", ".md") + # ... one add_path per sub-agent +) +``` + +The key is the identifier of this field in reflection prompts / artifact filenames; it just needs to be readable by the business. + +**Step 2: Wrap the entire chain call into `call_agent`, and ensure sub-agents re-read prompts each time** + +```python +async def call_agent(query: str) -> str: + return await invoke_pipeline(query) # Your existing chain entry +``` + +Key constraint inside `invoke_pipeline`: **each sub-agent must re-read its own prompt file each time it is called**, otherwise new candidates written by the optimizer will not take effect. 
+ +**Step 3: Turn on multi-field related switches in `optimizer.json`** + +```jsonc +{ + "optimize": { + "algorithm": { + "module_selector": "round_robin", // Select 1 field per round in rotation, convenient for attribution + "use_merge": true, // Actively fuse after accumulating several single-field improvements + "max_merge_invocations": 3, + "reflection_history_top_k": 3 // Recommended to increase when multi-field rotation (default 2) + } + } +} +``` + +See [§7 Complete API Reference](#7-complete-api-reference) for the complete semantics and value mappings of each parameter. + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Does each sub-agent re-read its own prompt file each time it is called? | No → New candidates written by optimizer won't take effect; evaluation between candidates is equivalent to running the same baseline | +| Can end-to-end metrics reflect the joint quality of all fields? | No → Feedback signal seen by reflection LM is not real; recommend using `final_response_avg_score` to evaluate final response | +| How many LLM inferences does a single case go through? | Call volume multiplies by chain depth; need to correspondingly reduce `eval_case_parallelism` / `reflection_minibatch_size` to prevent rate limit | +| Do sub-agents need to be in the same process? 
| Not necessary—`call_agent` internals can be HTTP / gRPC / internal SDK / other orchestration frameworks; as long as it ultimately returns `str` | + +**→ Complete example**: [`examples/optimization/multi_agent_pipeline/`](../../../examples/optimization/multi_agent_pipeline/) +- `pipeline/orchestrator.py` — Multi-sub-agent chain implementation, sub-agents re-read prompts on each call +- `run_optimization.py` — Standard entry for multi-field `TargetPrompt` +- `optimizer.json` — Recommended configuration for multi-field scenarios + +### 4.4 My Prompts Are Not in Local Files, Stored in Remote Configuration Center / KV / Database {#44} + +**Your situation**: Business prompts are not in local files, but placed in a remote configuration center (QCS / Apollo / Nacos / self-developed KV / database / Git, etc.), and the business fetches and uses them from the center. The optimizer cannot directly access the file system—it can only interact with the remote through the business's own SDK. + +**Integration model**: `TargetPrompt` abstracts "where prompts are" into a pair of async functions `read` / `write`—the optimizer calls `read` to get the baseline snapshot, calls `write` to persist candidates; the remote backend form (KV / RPC / SQL / Git API ...) is **completely black box** to the optimizer. This is isomorphic to the structure coupled through local prompt files in §4.1 / §4.2, the difference is only replacing "read/write files" with "calling two async functions given by the business". 
+ +```text ++-------------------+ async read / write +---------------------+ +| AgentOptimizer | <-------------------------------> | Remote config | +| (optimizer) | (your own SDK / HTTP / RPC) | (KV / DB / Git ...)| ++---------+---------+ +---------+-----------+ + ^ | + | best_prompts/ persisted locally | Business calls + | | pulls config + v v + +-------------------+ +---------------------------+ + | output_dir/ | | call_agent internals | + | best_prompts/ | | Pull latest prompt then | + +-------------------+ | call agent | + +---------------------------+ +``` + +**Integration in 3 steps**: + +**Step 1: Implement a pair of async functions to operate remote prompts** + +```python +async def read_prompt() -> str: + return await your_config_sdk.get(key="system_prompt") + +async def write_prompt(value: str) -> None: + await your_config_sdk.put(key="system_prompt", value=value) +``` + +Signature constraints: `read: async () -> str`, `write: async (str) -> None`. Retry / idempotency / authentication are guaranteed by the business's own SDK. + +**Step 2: Use `add_callback` instead of `add_path` to register `TargetPrompt`** + +```python +target = TargetPrompt().add_callback( + "system_prompt", + read=read_prompt, + write=write_prompt, +) +``` + +`add_callback` and `add_path` are peers on `TargetPrompt`—multi-field can also be mixed (some fields in local files, some fields in remote configuration center). + +**Step 3: Write `call_agent` as "pull now, use now", call `optimize` as usual** + +```python +async def call_agent(query: str) -> str: + prompt_text = await read_prompt() # Pull now, ensure candidate writes take effect immediately + agent = create_agent(prompt_text) + return await runner.run_async(query, ...) 
+ +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs//", + update_source=False, # Decision table see §3.4 +) +``` + +The value of `update_source` is determined by the business side's prompt write-back strategy (see §3.4 decision table for details), the framework has no additional restrictions on it. + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Does the business side re-pull configuration on each call? | No → After optimizer writes new candidate, business cannot perceive it, reflection loop fails | +| Are both `read` / `write` async functions? | No → Error reported immediately when registering with `add_callback` | +| Is `write` idempotent (accepts repeated writes of the same value)? | No → May fail when automatically rolling back to baseline at finish, leaving remote contaminated | +| Does the optimizer process have write permission for this key / namespace? | No → `write` throws permission error, current candidate evaluation fails | + +> **Safe mode involving production prompts** (adopt as needed, not forced by framework): If the business side already has sandbox / production namespace isolation, you can let the optimizer only read/write sandbox keys, cooperate with `update_source=False` to let the optimizer automatically roll back sandbox at finish, the best candidate is only persisted locally in `best_prompts/`, then synchronized to production through the business's own approval flow. `examples/optimization/remote_prompt_store/` demonstrates this workflow. 
+ +**→ Complete example**: [`examples/optimization/remote_prompt_store/`](../../../examples/optimization/remote_prompt_store/) +- `store/prompt_client.py` — `read` / `write` async function definitions, core transformation point for integrating business configuration center SDK +- `run_optimization.py` — Standard entry for `add_callback` registration (demonstrates workflow using sandbox + `update_source=False` + manual approval) + +### 4.5 Single Evaluation Metric Is Insufficient, Need Multiple Metrics and Fuse into Total Score {#45} + +**Your situation**: Business launch has requirements for agent output in more than one dimension—answer must be correct (correctness hard constraint) + must not talk nonsense (hallucination rate) + style must comply with specifications (format / tone) + must not contain sensitive words (compliance)... Single metric cannot contain all, forcibly using a single composite metric means the feedback signal seen by the reflection LM is a mixed scalar, making it difficult to attribute directionally. + +**Integration model**: `optimizer.json`'s `evaluate.metrics` is a **list**—directly list multiple metrics, each scored independently, with independent threshold and independent configuration. Early stop determination declares which metrics must reach the threshold through `optimize.stop.required_metrics`; GEPA internally decides how to maintain the Pareto frontier among multiple metrics through `optimize.algorithm.frontier_type` to avoid "fixing A drags down B". The entire mechanism is purely configuration-driven—`call_agent` and `TargetPrompt` both do not need to change a single line of code for multi-metric. 
+ +**Configuration in 3 steps**: + +**Step 1: List all metrics in `evaluate.metrics`** + +```jsonc +{ + "evaluate": { + "num_runs": 2, // Smooth LLM output variance (>1 lets each case run multiple times and take mean) + "metrics": [ + { + "metric_name": "llm_final_response", // Hard constraint: is answer substantively equivalent to reference + "threshold": 1.0, + "criterion": { "...": "..." } // Complete fields see §7 / example + }, + { + "metric_name": "llm_rubric_response", // Soft constraint: multiple rubrics (format / style / units ...) + "threshold": 0.75, + "criterion": { "...": "..." } + } + ] + } +} +``` + +Each metric is scored independently and written independently to `metric_breakdown` in `result.json`, convenient for reverse-attributing which metric a certain evaluation lost points on. + +**Step 2: Declare early stop gate in `optimize.stop.required_metrics`** + +| Value | Semantics | Applicable Scenario | +| --- | --- | --- | +| `"all"` | Early stop only when all metrics reach threshold | All metrics are must-pass items | +| `["m1", "m2"]` | Early stop only when all metrics in the list reach threshold (other metrics still participate in evaluation but do not affect early stop) | Some metrics are reference observation items, not used as gates | +| `null` or `[]` | Does not participate in early stop, only controlled by algorithm-level budget / no-improvement / score_threshold | Just want to run out the budget and see results | + +**Step 3: Adjust `frontier_type` to a value that correctly handles multiple metrics** + +| Value | Meaning | Applicable | +| --- | --- | --- | +| `instance` | Maintain one best candidate per case | Single metric or no obvious conflict between metrics | +| `objective` | Maintain one best candidate per metric | Multiple metrics but small case count | +| `hybrid` | Maintain both case + metric two-layer frontier | **Real conflict scenario with multiple metrics** (recommended default) | +| `cartesian` | One best candidate per (case, 
metric) combination | Extremely complex / debugging use, candidate pool easily explodes | + +`hybrid` lets GEPA not lose the best candidate on another metric when improving one metric—the **safe default for multi-metric business**. See [§7](#7-complete-api-reference) for the complete definition of each value. + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Do the `threshold` values of each metric conform to business requirements? | No → Early stop determination is inaccurate; business-critical metrics may not have reached standard when optimization ends | +| Are only "hard constraints" listed in `stop.required_metrics`? | No → Soft constraint fluctuations will repeatedly interrupt early stop determination, wasting budget | +| Does `eval_case_parallelism` consider the concurrency of metric count Ɨ judge count? | No → Single-round LLM call volume explodes (N cases Ɨ M metrics Ɨ K judges Ɨ `num_runs`), easily hitting LLM backend rate limit | +| Is `num_runs` reasonable (default 1)? 
| Single LLM judge output has variance; recommend `num_runs=2` to let each case run twice and take mean to eliminate jitter | + +**→ Complete example**: [`examples/optimization/multi_metric_with_judges/`](../../../examples/optimization/multi_metric_with_judges/) +- `optimizer.json` — Complete configuration example with `llm_final_response` (multi-judge `all_pass` voting) + `llm_rubric_response` (single judge multi-rubric) + `frontier_type=hybrid` + `stop.required_metrics` list style +- `run_optimization.py` — Standard entry consistent with single-metric scenarios (multi-metric does not affect entry code) + +### 4.6 Want to Integrate CI Closed Loop: PR Gate + Night Optimization Auto Write-Back {#46} + +**Your situation**: You hope prompt engineering also follows the CI/CD process—each PR automatically runs evaluation gate (score below threshold means CI red light, preventing degraded prompts from entering main branch), while simultaneously running reflection optimization in a low-peak window to write back better prompts, and the next PR automatically uses them. **Using either link alone is not enough**: pure gate will not automatically make prompts better, pure optimization has no quality gate. + +**Integration model**: `AgentEvaluator.evaluate` (pytest runs PR gate) and `AgentOptimizer.optimize` (night optimization) share **the same set of assets**—the same `call_agent`, the same evalset (physically split into train / val two files to prevent leakage, logically one set of corpus), the same pair of prompt files. `update_source=True` is the key switch for the closed loop: after optimization succeeds (`OptimizeResult.status=SUCCEEDED`), the optimal candidate directly overwrites the source prompt files, and the next PR-triggered pytest automatically reads the new content. 
+ +```text + +-----------------------------------------------------+ + | Shared assets: call_agent + evalset + prompt files | + +------+----------------------------------------+-----+ + | | + Trigger: PR | | Trigger: Night window + v v + +---------------------------+ +---------------------------+ + | AgentEvaluator.evaluate | | AgentOptimizer.optimize | + | (pytest runs) | | update_source=True | + | | | | + | Score < threshold → Red | | Success → Overwrite | + | pytest exit != 0 → | | source prompts | + | Block PR | | Failure → Files unchanged| + +---------------------------+ +-------------+-------------+ + | + v + Next PR automatically + uses new prompts + (Forms "eval→optimize→eval" + evolution closed loop) +``` + +**Integration in 3 steps**: + +**Step 1: Extract `call_agent` into a module shared by evaluate / optimize** + +```python +# agent/agent.py (both pytest and optimizer import from here) +async def call_agent(query: str) -> str: + ... +``` + +**Why must share**: The agent used during evaluation and the agent used during optimization must be **equivalent**—otherwise "optimizer found a good prompt that evaluator cannot verify" or the reverse problem will occur. Sharing the same `call_agent` file is the most direct code-level guarantee. Any agent changes (model switch / temperature adjustment / output schema change) only need to be changed in one place. + +**Step 2: Write pytest entry for PR gate** + +```python +# tests/test_agent_quality.py +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator +from agent.agent import call_agent + +@pytest.mark.asyncio +async def test_agent_quality(): + await AgentEvaluator.evaluate( + call_agent=call_agent, + eval_set_path="data/val.evalset.json", + test_config_path="optimizer.json", # Reuse same metric configuration + ... 
+ ) # Framework throws AssertionError when score is below threshold → pytest red +``` + +Run in CI pipeline: + +```bash +pytest tests/ --junitxml=runs/pytest_report.xml +``` + +The `--junitxml` output is a standard format test report, parsed natively by mainstream platforms like GitHub Actions / BlueKing Pipeline / Tencent CI. When failing, the `AssertionError` message contains the failure details JSON for each case; when the CI platform displays the stack trace, it can directly see which case failed, what the agent actually output, and where the difference from expected is. + +**Step 3: Night window runs optimization + `update_source=True`** + +```python +# run_optimization.py (triggered by night cron) +await AgentOptimizer.optimize( + config_path="optimizer.json", # Same metric configuration as pytest + call_agent=call_agent, # Same call_agent as pytest + target_prompt=target, + train_dataset_path="data/train.evalset.json", + validation_dataset_path="data/val.evalset.json", + output_dir="runs/optimize_/", + update_source=True, # Key switch for CI closed loop +) +``` + +Safety guarantee of `update_source=True`: Source prompt files are only written back when `OptimizeResult.status=SUCCEEDED`; source files remain unchanged in other states such as failure / budget exhaustion. Overwrite uses atomic write (tmp + `os.replace`), midway exceptions / SIGINT will not corrupt source prompt files (see [§8.3](#83-atomic-disk-persistence-guarantee) for details). + +It is recommended to add `git diff --quiet agent/prompts/` at the end of the night script to determine if there are changes; exit directly if no changes; if there are changes, then `git checkout -b ...` + automatically open a PR—letting new prompts go through the standard PR review process instead of directly entering main branch. + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Is `call_agent` **the same code** shared by pytest and optimizer? 
| No → Agent for evaluation and agent for optimization are not equivalent; optimization direction and gate direction drift | +| Do pytest and optimizer use **the same metric configuration**? | No → "Evaluation can pass but optimizer sees low score" or the reverse problem. Recommend reusing through `test_config_path` in pytest for the `optimizer.json.evaluate` section | +| Is evalset physically split into train / val two files? | No → SDK `_validate_inputs` forcibly validates `train != val`, otherwise reports error fail-fast | +| Does the night script have `git diff` + automatic PR opening steps at the end? | No → Optimized prompts directly enter main branch, bypassing review; recommend always going through PR process | +| Is there a grayscale strategy for prompt changes ready? | When multiple business lines share the same prompt repository, recommend switching to `update_source=False` + business's own grayscale deployment tool | + +**→ Complete example**: [`examples/optimization/ci_integration/`](../../../examples/optimization/ci_integration/) +- `agent/agent.py` — `call_agent` shared by pytest and optimizer +- `tests/test_agent_quality.py` — pytest gate entry (called at PR stage) +- `run_optimization.py` — Night optimization entry (`update_source=True`) +- `ci/run_pr_check.sh` / `ci/run_nightly_optimize.sh` — CI pipeline shell entries + +### 4.7 Optimization Task Has Hard Constraints: Must Complete Within a Time Window / Cumulative Calls Not Exceeding N / Stop After Consecutive No-Improvement {#47} + +**Your situation**: Your optimization task runs in a constrained environment—CI pipeline must end within N minutes, LLM backend quota is calculated monthly and single run cannot exhaust it, should actively give up after several consecutive rounds without improvement. **Single stop condition is not enough**: only setting timeout may stop before budget is used up, only setting budget may run until the end of time. 
You need a multi-stop strategy of "stop immediately when any SLO triggers". + +**Integration model**: The `optimize.algorithm` section of `optimizer.json` provides 6 algorithm-level stop conditions, with **OR semantics**—stop immediately when any one triggers. You reverse-calculate each threshold according to business SLO, and enable multiple switches simultaneously. When optimization ends, the `OptimizeResult.stop_reason` field tells you which SLO triggered first, convenient for subsequent parameter tuning. + +**Configuration in 3 steps**: + +**Step 1: Select several stop conditions that the business cares about from the 6 types** + +| Field | Trigger Condition | Typical Business Scenario | +| --- | --- | --- | +| `timeout_seconds` | Wall-clock exceeds N seconds | CI pipeline time window hard constraint (must end within N minutes) | +| `max_metric_calls` | Cumulative case evaluation count ≄ N | LLM backend quota hard upper limit | +| `max_candidate_proposals` | Reflection LM cumulative proposal count ≄ N | Limit reflection LM call budget | +| `max_iterations_without_improvement` | N consecutive rounds without best valset improvement | Actively give up when already converged or trapped in local optimum | +| `score_threshold` | Best valset pass_rate ≄ threshold | Already reached business goal, no need to continue | +| `max_tracked_candidates` | Pareto frontier candidate pool size ≄ N | Control memory and merge candidate space size | + +See [§7.3.3](#733-optimizealgorithm-section) for the complete definition of each field. **Configure at least 1**—otherwise the framework fails fast at startup. 
+ +**Step 2: Reverse-calculate each threshold according to business SLO** + +```jsonc +{ + "optimize": { + "algorithm": { + "timeout_seconds": 90.0, // CI must end within X minutes → set X*60 / 2 to leave buffer + "max_metric_calls": 30, // LLM quota → reverse-calculate by "calls Ɨ single-run duration" + "max_iterations_without_improvement": 3, // Give up after 3 consecutive rounds without improvement + "score_threshold": 1.0 // Stop when business goal is reached + } + } +} +``` + +**Two key reverse-calculations**: + +| Item | How to test | How to reverse-calculate | +| --- | --- | --- | +| Typical single-round duration | Run a baseline, look at `rounds[*].durationSeconds` in `runs//result.json` (take median) | `timeout_seconds` should be at least single-round duration Ɨ 2, otherwise the first round triggers stop and you cannot see optimization progress | +| Single-round metric_calls count | Same as above, look at `totalMetricCalls / totalRounds` in round | `max_metric_calls` should be able to run through at least `max_iterations_without_improvement` rounds, otherwise budget always triggers stop first | + +**Step 3: Clarify whether to participate in framework-level metric early stop** + +| Value | Semantics | +| --- | --- | +| `optimize.stop.required_metrics: "all"` or `["m1"]` | Metric reaching threshold also participates in OR trigger | +| `optimize.stop.required_metrics: []` | Only let the 6 algorithm-level stoppers decide | + +Business requirements: +- **Care about whether metrics reach standard** (typical prompt quality optimization) → use `"all"` or specific list +- **Only care about time / call budget** (known to converge, purely capping resources) → use `[]` + +**`stop_reason` value reference**: When optimization ends, the `OptimizeResult.stop_reason` value can tell you the trigger—`score_threshold_reached` / `budget_exhausted` / `timeout_reached` / `no_improvement` / `max_proposals_reached` / `max_tracked_candidates_reached` / `user_requested_stop` (user 
actively triggers through `optimize.stop` sentinel file). + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Are thresholds all reverse-calculated through baseline measurements, not intuited? | No → Highly likely some stopper always triggers first (e.g., timeout triggers in round 1), other configurations are decoration | +| Does `timeout_seconds` leave buffer (≤ 50% of real business window)? | No → Under the framework's "complete current round then stop" semantics, actual termination time may exceed the timeout set value, hitting business hard deadline | +| Do single-round LLM calls have their own timeout (e.g., CLI / HTTP calls)? | No → Single round hangs, entire timeout can only wait for current round to finish, may seriously exceed timeout (refer to CLI_TIMEOUT_SEC pattern in §4.2) | +| Have you run a baseline in the test environment once to verify `stop_reason` is consistent with expectations? | No → Only discover stopper behavior is inconsistent with expectations after going to CI, cannot quickly diagnose | + +**→ Complete example**: [`examples/optimization/slo_runtime_control/`](../../../examples/optimization/slo_runtime_control/) +- `optimizer.json` — Configuration example with all 6 stop conditions enabled (business real integration should reverse-calculate thresholds according to own SLO, do not directly copy example values) +- `run_optimization.py` — After running, the `stopReason` field in `result.json` identifies the trigger + +### 4.8 Can Already Run Through Basic Process, Want to Further Improve Results (GEPA Candidate Selection / Pareto Frontier / Cross-Field Fusion) {#48} + +**Your situation**: You have already run through the basic optimization process according to quickstart, and can stably see score improvement from baseline → best. 
Now you want to understand several advanced switches of GEPA—`candidate_selection_strategy` / `frontier_type` / `use_merge` / `skip_perfect_score`—whether they are **actually useful on your task, whether they can squeeze out a few more points**. But running optimization once often cannot see the difference, because GEPA can converge to similar `best_pass_rate` on most tasks—**the difference is hidden in the arrival path** (round count / acceptance rate / whether merge triggered / reflection LM call count), not in the final score. + +**Integration model**: Use **A/B controlled experiment**—same business, same evalset, same `seed`, run two different `optimizer.json`: one is the current online configuration or default configuration (baseline), one is the advanced combination to be verified. After running, compare the two `result.json`, focusing on **multi-dimensional metrics** rather than single `best_pass_rate`. + +**Experiment in 3 steps**: + +**Step 1: Use current configuration as baseline, fix other variables** + +```jsonc +// optimizer_baseline.json +{ + "optimize": { + "algorithm": { + "seed": 42, // Fix seed to exclude randomness + "max_metric_calls": 30, // Keep consistent with advanced to fairly compare + "candidate_selection_strategy": "pareto", + "frontier_type": "instance", + "skip_perfect_score": false, + "use_merge": false + } + } +} +``` + +**Step 2: Write advanced configuration, only change the switches to be verified** + +```jsonc +// optimizer_advanced.json (only differs from baseline by a few switches) +{ + "optimize": { + "algorithm": { + "seed": 42, + "max_metric_calls": 30, + "candidate_selection_strategy": "pareto", + "frontier_type": "objective", // Change: from instance to objective + "skip_perfect_score": true, // Change: skip perfect score cases to save reflection calls + "use_merge": true // Change: enable cross-field fusion (only actually triggers in multi-field) + } + } +} +``` + +**Step 3: Run twice + parse `result.json` to output 
multi-dimensional comparison** + +```bash +python run_baseline.py # Produce runs/baseline_/result.json +python run_advanced.py # Produce runs/advanced_/result.json +python compare.py # Parse two result.json, output comparison table +``` + +Dimensions `compare.py` should focus on: + +| Dimension | Field (indexed by camelCase in `result.json`) | Interpretation | +| --- | --- | --- | +| Final quality | `bestPassRate` / `baselinePassRate` | End-to-end score improvement; two strategies converge closely on most tasks | +| Exploration depth | `totalRounds` / `roundsAccepted` | Acceptance rate (`roundsAccepted / totalRounds`) reflects frontier acceptance threshold | +| Merge behavior | `mergeRoundsTotal` / `rounds[*].kind` | Verify `use_merge=true` actually triggers merge | +| Reflection budget | `metricCallsTotal` / `proposalsTotal` | `skip_perfect_score=true` saves more obviously on large training set + high baseline start | +| `stop_reason` | `stopReason` | Which stopper triggered; cannot directly compare when advanced/baseline have different stop_reason | + +> **Pitfall reminder**: Fields in `result.json` are camelCase (`bestPassRate` not `best_pass_rate`). SDK uses snake_case internally, automatically converted to camelCase during serialization through pydantic alias. Index by camelCase when reading `result.json`. 
+ +**Expected performance of several advanced switches** (may not all hold on business tasks—use your own actual measurements as basis): + +| Switch | Expected Benefit | Applicable Prerequisites | +| --- | --- | --- | +| `frontier_type="objective"` (vs `"instance"`) | Higher acceptance rate / more aggressive exploration | Multi-metric scenario; may overfit train minibatch on small training set (< 10 cases) causing valset oscillation | +| `frontier_type="hybrid"` | Multiple metrics do not overwrite each other | Real conflict scenario with multiple metrics (see §4.5) | +| `skip_perfect_score=true` | Save reflection LM calls | Large-scale training set + high baseline start; few perfect score cases on small dataset, limited savings | +| `use_merge=true` | Cross-field fusion candidates | **Only actually triggers when multi-field (`add_path` ≄ 2)**; always 0 merge rounds in single-field configuration (`mergeRoundsTotal=0` is expected, see §4.3) | + +**Pre-integration checklist**: + +| Check Item | Consequence of Failure | +| --- | --- | +| Do the two configurations only differ in **the few switches to be verified**, all others identical? | No → Comparison result contains confounding variables, conclusion is not credible | +| Is `seed` consistent between the two sets? | No → Difference may come from randomness rather than configuration strategy | +| Is `max_metric_calls` consistent between the two sets? | No → One set naturally has higher score with more budget, cannot attribute to strategy | +| Are you simultaneously focusing on **multi-dimensional comparison** rather than single `bestPassRate`? | No → Final scores of two strategies are close on most tasks, cannot see difference; difference is hidden in arrival path | +| Do switches like `use_merge` / `skip_perfect_score` make sense in your task structure? 
| Enabling `use_merge` on single-field task never triggers (harmless but no benefit); enabling `skip_perfect_score` on high-baseline task saves considerably | + +> A more complex advanced configuration is **not necessarily better**. On many tasks, baseline configuration can already achieve reasonable convergence; advanced only shows value in specific task structures (multi-objective, multi-field, large-scale training set, etc.). **Use data to decide, not intuition**. + +**→ Complete example**: [`examples/optimization/advanced_strategies/`](../../../examples/optimization/advanced_strategies/) +- `optimizer_baseline.json` / `optimizer_advanced.json` — Two configurations for A/B control (only differ by 3 switches) +- `run_baseline.py` / `run_advanced.py` — Two independent entries (keeping other variables consistent) +- `compare.py` — Standard template for parsing two `result.json` and outputting multi-dimensional comparison table + +## 5 How GEPA Works + +After running an optimization and watching the score increase from 0.4 to 0.85, you may still not know **what exactly the framework did along the way**—what data did it read? What did the reflection LM see? On what basis did it decide to retain or discard a candidate? When SLO triggers, does it stop immediately or wait for the current round to finish? + +> **GEPA** (Genetic-Pareto) is a reflection-based evolutionary search algorithm ([gepa-ai/gepa](https://github.com/gepa-ai/gepa), MIT License). This framework wraps `gepa.optimize()` into `GepaReflectiveOptimizer` through `OPTIMIZER_REGISTRY`, and adds a layer of SDK adaptation (evaluation bridging, reflection feedback construction, stop determination, atomic disk persistence, etc.). 
+ +### 5.1 What Exactly Runs in One Optimization Round + +**First remember three roles**—all subsequent diagrams and tables revolve around these three: + +| Role | Who Is It | What It Does | +| --- | --- | --- | +| **agent** | Your business agent (accessed through `call_agent`) | Receives one query, outputs one response | +| **judge / metric** | Configured evaluators in `evaluate.metrics` | Score agent responses (0~1) | +| **Reflection LM** | LLM configured in `algorithm.reflection_lm` | Views failure case feedback → generates new prompt candidates | + +**Round 0**: Run valset with baseline prompt → get baseline score (your "starting line") + +**Each subsequent round (reflective round)** follows these 5 steps: + +```text + ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” + │ Candidate prompt selected │ + │ in previous round │ + ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + ā–¼ + (1) Sample minibatch → Randomly sample N cases from trainset + (N = reflection_minibatch_size) + │ + ā–¼ + (2) Run one evaluation → Write candidate to prompt file + → Call call_agent to run these N cases + → Metric scores, get failure cases + │ + ā–¼ + (3) Reflection LM → Feed failure case feedback to + generates candidate reflection LM + → It outputs new prompt text + │ + ā–¼ + (4) Re-evaluate + enter → Re-run new candidate on minibatch + Pareto frontier → Better than historical → enter + frontier, otherwise discard + │ + ā–¼ + (5) Check stop conditions → Any of 6 stoppers triggered → stop + → Otherwise enter next round +``` + +**Several key explanations**: + +- **"Evaluation" in step (2)** actually runs `len(minibatch) Ɨ num_runs Ɨ len(metrics)` LLM evaluations (see §6.1 for details) +- **"What reflection LM sees" in step (3)** determines rewrite quality—this is the content of next section §5.2 +- **"Pareto frontier" in step (4)** simply put is "retain the set of candidates that are not surpassed in 
all aspects"; specific granularity is controlled by `frontier_type` (see §5.3 for details) +- **"Stop when any triggers" in step (5)** has a detail: after triggering, **wait for current round to finish before actually stopping**, not immediately kill (see §5.4 for details) +- **Valset evaluation** is interleaved in the middle rounds (determined internally by gepa), used to calculate the "real score of current best candidate on valset", also the basis for stopper judgments such as `score_threshold` / `required_metrics` + +**Special case: merge round** + +When `use_merge=true`, a **merge round** is inserted every several reflective rounds: select two candidates from the Pareto frontier and fuse them into one new candidate ("take A's wording on field X + B's wording on field Y"). **Only meaningful in multi-field scenarios**—never triggers in single-field, `mergeRoundsTotal=0` is expected. See §4.3 for details. + +### 5.2 What Reflection LM Actually Sees + +The quality of the reflection LM's prompt rewriting **completely depends on how rich the failure feedback it can see**. If you only tell it "case_3 failed, score 0.3", it can only guess blindly; if you tell it "case_3 turn 2 agent should output `{"city":"Shanghai"}` but actually output `Shanghai`, rule requires case-sensitive exact match", it can targetedly modify the prompt. + +`_AgentGEPAAdapter.make_reflective_dataset` renders a markdown record for each **failed case**, fed to the reflection LM. 
Each record field: + +| Field | One-Line Explanation | When It Appears | +| --- | --- | --- | +| `case_id` | Stable ID of the case (for reflection LM cross-reference) | Always | +| `score` | Aggregate score of this case (0~1, 1.0 = all metrics passed) | Always | +| `Case Body` | Markdown of failure scene: one segment per turn, containing user input, expected response, agent actual response, tool call trace, each metric's judgment (PASS/FAIL + score + failure reason) | Always | +| `Other Active Components` | What do other prompt fields NOT being rewritten in this round look like | When multi-field optimization—lets reflection LM see B/C status when modifying A, avoiding breaking upstream/downstream compatibility | +| `history_top_k` | Best agent responses for this case in history (sorted by score) | When `reflection_history_top_k > 0` | + +**Specific structure of `Case Body`**: + +```text +### Turn 1 +**User**: +**Expected**: +**Agent Response**: +**Tool Trace**: ← Only when tool calls exist + - tool_name(args) → response +**Verdict** (Turn 1): + [FAIL] metric_name: score=0.0000, threshold=1.0000 + reason: agent output not byte-equal to expected (case-sensitive) + Ā· rubric[no_emoji]: PASS score=1.00 ← Only for LLM rubric metric + +### Turn 2 +... + +### Overall (case-level aggregate) ← When multi-turn or multi-run +... +``` + +**Failure reason synthesis for deterministic metrics**: When metric is an evaluator without LLM judge like `final_response_avg_score`, only outputting score+status, the framework will **automatically synthesize a failure explanation** (e.g.: `agent output not byte-equal to expected (case-sensitive)` / `expected substring not contained in agent output (case-insensitive)` / `JSON structural comparison failed`), letting the reflection LM directly see **why it didn't match**, without having to diff text to guess. + +> Want to see the full reflection prompt that the reflection LM actually receives? 
Set `verbose=2` when running optimization; gepa's internal logs will then include each round's reflection prompt text—read it once and you'll have a good understanding. + +### 5.3 Actual Behavior of 5 Core Operators + +The 5 switches most frequently asked about in the `optimize.algorithm` section of `optimizer.json`, and what they actually do in the source code: + +| Operator | One-Line Function | Typical Motivation to Adjust It | Detailed Reference | +| --- | --- | --- | --- | +| `reflection_minibatch_size` | How many cases the reflection LM sees each round | Smaller saves tokens, larger gives reflection LM more complete view | [§7.3.3](#733-optimizealgorithm-section) | +| `module_selector` | Which field to modify this round in multi-field (`round_robin` rotation / `all` select all / `random` random) | Want clear attribution of each field's contribution → `round_robin` | [§4.3](#43) | +| `frontier_type` | Pareto frontier granularity (`instance` one best per case / `objective` one per metric / `hybrid` two-layer / `cartesian` Cartesian product) | When multiple metrics truly conflict → `hybrid` | [§4.5](#45) | +| `candidate_selection_strategy` | How to select parent for next round's reflection (`pareto` default select from frontier / `current_best` use current best / etc.) | Want to accelerate convergence or increase exploration | [§7.3.3](#733-optimizealgorithm-section) | +| `use_merge` + `max_merge_invocations` | Whether to enable cross-field fusion + upper limit on trigger count | **Only actually triggers in multi-field**—`mergeRoundsTotal=0` is expected in single-field | [§4.3](#43) / [§4.8](#48) | + +### 5.4 Stop Timing: Complete Current Round Before Stopping + +6 algorithm-level stop conditions (`max_metric_calls` / `timeout_seconds` / `max_iterations_without_improvement` / `score_threshold` / `max_candidate_proposals` / `max_tracked_candidates`) are **synchronously checked at the end of each round**—stop when any condition is satisfied. 
+ +**3 easily stepped-on details**: + +| Detail | Meaning | How to Avoid | +| --- | --- | --- | +| **Does not immediately kill current round** | When stop is triggered, it will not interrupt the currently running round; must wait for current round to finish before actually stopping | In SLO hard deadline scenarios, set `timeout_seconds` to about 50% of the real business window, leave buffer | +| **Actual termination time often exceeds `timeout_seconds`** | Direct consequence of the previous point—especially obvious when stuck in a long round | Add your own timeout to LLM calls inside `call_agent` (refer to 90s timeout in §4.2 CLI) | +| **Priority when multiple stoppers trigger simultaneously** | `framework_stopper` (`required_metrics` policy) first; then take the first one in algorithm-level stopper insertion order | `OptimizeResult.stop_reason` field records the trigger, see which one triggered directly after running | + +**`stop_reason` value reference** (`OptimizeResult.stop_reason`): + +``` +required_metrics_passing ← framework-level (highest priority) +score_threshold ← Reached target score +budget_exhausted ← max_metric_calls +timeout ← timeout_seconds +no_improvement ← max_iterations_without_improvement +max_candidate_proposals +max_tracked_candidates +user_requested_stop ← User touched optimize.stop file +completed ← No stopper triggered, gepa naturally finished +``` + +### 5.5 A Special Case: FAILED + +Normally `OptimizeResult.status = "SUCCEEDED"`—gepa finished the loop (natural end / stopper trigger both count). 
But there is one special status worth user attention: + +- **`status = "FAILED"`**: gepa threw an exception during running (most common: training/validation set loading failure, `gepa.optimize()` internal exception, reflection LM call failure) +- **At this time `best_prompts` is forcibly set to `baseline_prompts`**—ensuring the artifacts you get **will never be worse than baseline** +- **`update_source=True` will not write back** source prompt files when FAILED (see §3.4 decision table for details) + +Another easily confused point is "finished running but no improvement": in this case `status` is still `"SUCCEEDED"`, but `finish_reason="no_improvement"`, and `best_prompts == baseline_prompts`—`summary.txt` will show `baseline → baseline` (neither degradation nor improvement). This is expected, not a bug. + + +## 6 Cost and Concurrency + +How many LLM calls does one optimization run require? Which knobs affect call volume, which affect concurrency, which affect both? + +### 6.1 Where LLM Calls in One Optimization Come From + +LLM calls are divided into two parts—**evaluation side eats the vast majority**, reflection side is just a fraction: + +**Evaluation side (agent + judge)**: Each of the following "run once" steps triggers a batch of LLM calls— + +```text +Run one baseline evaluation: Run valset fully once ← Starting point, 1 time +Each reflective round: Sample N cases and run once + re-run candidate ← Main cost +Specific reflective round: Re-evaluate current best candidate on valset ← Determined by gepa +``` + +Actual LLM call count triggered by each "run once" = **number of cases Ɨ `num_runs` Ɨ (agent call count per case + judge call count per metric, summed over all metrics)**. 
Among them: + +| Multiplier | Source | Typical Value | +| --- | --- | --- | +| Agent call count per case | Evalset data; accumulate by turn count in multi-turn conversation | Single turn = 1, multi-turn = N | +| `evaluate.num_runs` | Run each case several times and take mean to eliminate LLM output variance | 1 (default, saves) / 2~3 (recommended, stable) | +| Judge call count per metric | Depends on metric type: `final_response_avg_score` type deterministic matching = 0 times; `llm_judge` / `llm_rubric_response` ≄ 1 time (however many are in `judge_models` array) | 0~3 | + +**Reflection side (reflection LM)**: + +```text +Each reflective round: 1 time (generate new candidate prompt) +Each merge round: 1 time (only when use_merge=true and multi-field) +``` + +Reflection side call count is much less than evaluation side—usually 5~20 times for a complete optimization. + +### 6.2 What to Read from result.json After Running + +Fields actually recorded in `OptimizeResult` (camelCase indexed in artifact `result.json`): + +| Field | Meaning | +| --- | --- | +| `totalMetricCalls` | Cumulative case-level evaluation count by gepa | +| `totalReflectionLmCalls` | Cumulative reflection LM call count (including retries) | +| `totalTokenUsage` | Cumulative tokens for reflection LM: `{prompt, completion, total}` | +| `durationSeconds` | Total wall-clock duration | + +When needing to estimate actual USD cost on the business side, use `totalTokenUsage` Ɨ LLM backend unit price to reverse-calculate reflection side; agent / judge side is pulled from LLM backend usage records (API console / billing reports). 
+ +### 6.3 Multiplier Effect of 4 Commonly Used Knobs + +Sorted by "magnitude of impact on total call volume" from large to small—when encountering optimization running out of budget, adjust the ones above first: + +| Knob | Multiplies By How Much | Cost of Turning Down | Details | +| --- | --- | --- | --- | +| `algorithm.max_metric_calls` | **Hard upper limit on total call volume**—gepa stops when cumulative reaches it | Too small → Stopped by it in the 1st round; cannot see any score improvement | [§4.7](#47) | +| `evaluate.num_runs` | **Multiply by N**—run each case N times and take mean | LLM output variance directly enters score when 1 (same prompt gets different scores on two runs); recommend ≄ 2 | [§4.5](#45) | +| `optimize.eval_case_parallelism` | **Does not affect total volume**, only affects **wall-clock time** and **instantaneous QPS** | Higher saves time but easily hits LLM backend rate limit | [§4.5](#45) | +| `algorithm.reflection_minibatch_size` | **Multiply by a few**—how many cases the reflection LM sees each round; evaluation side also calculates by this number | Too large → Reflection prompt explodes LLM context window | [§4.3](#43) | + +### 6.4 Want to Reasonably Set Thresholds? 
Run a Baseline First + +Before setting thresholds such as `timeout_seconds` / `max_metric_calls`, **first run a baseline with default configuration**—read two numbers from the artifacts: + +| Value to Measure | How to Test | How to Use | +| --- | --- | --- | +| **Typical single-round duration** | `rounds[*].durationSeconds` in `runs//result.json` (take median) | `timeout_seconds` should be at least single-round duration Ɨ 2, otherwise stop is triggered in round 1 and you cannot see optimization progress | +| **Single-round metric_calls** | Same as above, `totalMetricCalls / totalRounds` | `max_metric_calls` should be able to run through at least `max_iterations_without_improvement` rounds, otherwise budget always triggers stop first | + +**Example**: Baseline run shows 30 seconds per round, 4 metric_calls per round, CI window 5 minutes—then `timeout_seconds=120` (leave buffer), `max_metric_calls=24` (enough to run 6 rounds for `max_iterations_without_improvement=3` to trigger stop). + +### 6.5 Single-Round Instantaneous LLM QPS Control + +Number of LLM requests concurrently sent in a single round: + +```text +Single-round instantaneous LLM QPS ā‰ˆ eval_case_parallelism + Ɨ num_runs + Ɨ (agent calls per case + all judge calls) +``` + +**Typical scenario estimation**: 3 judges + `num_runs=2` + `eval_case_parallelism=4` + 1 agent call per case + 3 judge calls → about 32 LLM requests per round instantaneous. When LLM backend rate limit is 30 QPS, this configuration will inevitably trigger rate limiting. 
+ +**Two parameters to control instantaneous QPS** (sorted by effect): + +| Parameter | Impact | Applicable | +| --- | --- | --- | +| `eval_case_parallelism` | Directly reduces concurrent case count | First choice for most situations; set to `1` for serial execution in scenarios with intensive single-case calls such as black-box CLI, multi-judge (see [§4.2](#42), [§4.5](#45)) | +| `num_runs` | Reduces repeated evaluation per case | Sacrifices some variance stability; recommend only lowering after confirming LLM output variance is small | + +### 6.6 Reflection LM Selection and Configuration + +The output quality of the reflection LM directly determines prompt rewriting quality. Configuration location (`optimizer.json`): + +```jsonc +{ + "optimize": { + "algorithm": { + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, // Reflection prompt is long, leave enough output space + "temperature": 0.6 // Between 0.6~0.8, let LM be creative + } + } + } + } +} +``` + +**Two suggestions**: + +- **Can be configured independently from agent / judge**—the `reflection_lm` section is independent, business can choose different model (avoid "self-evaluation" bias, or purely because reflection tasks require higher model reasoning power) +- **Token usage is truly recorded**—the `totalTokenUsage` field will accumulate actual prompt + completion + total token count for reflection LM; reverse-calculate USD by LLM backend unit price + + +## 7 Complete API Reference + +Reference manual section, organized by "what parameter are you looking for". 
**Each table has a "Required" column**, three-gear meaning: + +- **Required**: Not passed/not configured → fail-fast error at startup +- **Optional**: Can be omitted; uses default value when not configured +- **Conditionally Required**: Can be omitted when looking at the entry alone, but **must be configured when satisfying certain conditions**—conditions written in the "Condition" column at the end of each entry + +All fields are based on actual source code (source file path annotated in each table header). + +### 7.1 `AgentOptimizer.optimize` Parameter Table + +Source code: `trpc_agent_sdk/evaluation/_agent_optimizer.py:AgentOptimizer.optimize`. **11 keyword-only parameters**—must be passed in `key=value` form, positional parameters not accepted. + +| Parameter | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `config_path` | **Required** | `str` | — | optimizer.json configuration file path | +| `call_agent` | **Required** | `async (str) -> str` | — | Business agent adapter function; signature fixed as "accept query return str" | +| `target_prompt` | **Required** | `TargetPrompt` | — | Register which prompt fields are optimization targets (at least 1, otherwise error) | +| `train_dataset_path` | **Required** | `str` | — | Training evalset file path | +| `validation_dataset_path` | **Required** | `str` | — | Validation evalset file path; **must be different from `train_dataset_path`** (prevent data leakage, framework will normalize paths before comparing) | +| `output_dir` | **Required** | `str` | — | Artifact directory; created automatically if it doesn't exist | +| `callbacks` | Optional | `Optional[Callbacks]` | `None` | Evaluator lifecycle callbacks (rarely used) | +| `update_source` | Optional | `bool` | `False` | Whether to write back to source prompt files after successful optimization (decision table see [§3.4](#34-agentoptimizer)) | +| `verbose` | Optional | `int` | `1` | Terminal output verbosity: `0` silent / `1` default 
Rich panel / `2` plus gepa internal log forwarding | +| `extra_stop_callbacks` | Optional | `Optional[Sequence]` | `None` | Stoppers appended at runtime (SLO monitoring / kill switch, etc.); ordinary callable displays as `stop_reason="completed"`, use `_LabeledStopper` wrapper or expose `.label` attribute when needing stable labels | +| `extra_gepa_callbacks` | Optional | `Optional[Sequence]` | `None` | Gepa event callbacks appended at runtime (e.g., forwarding to dashboard); need to implement `gepa.core.callback.GEPACallback` protocol | + +**Return value**: `OptimizeResult` (see [§7.4](#74-optimizeresult--roundrecord-field-table) for details). + +**Fail-fast checks at startup** (`_validate_inputs`): + +| Situation When Check Fails | Throws | +| --- | --- | +| `output_dir` is empty string | `ValueError` | +| `target_prompt` did not register any fields | `ValueError` | +| `call_agent` is not async function (including `__wrapped__` check, supports `functools.partial` wrapped async) | `TypeError` | +| `train_dataset_path` and `validation_dataset_path` resolve to the same file (compared after normalizing with `os.path.normpath(os.path.abspath(...))`) | `ValueError` (prevent data leakage) | +| `evaluate.metrics` contains `tool_trajectory_avg_score` or `llm_rubric_knowledge_recall`—these two require session traces / tool intermediate_data, which cannot be obtained in `call_agent` black-box mode | `ValueError` | +| `algorithm.name` in config is not registered in `OPTIMIZER_REGISTRY` | `ValueError` (message lists all registered algorithm names) | +| `use_merge=true` and `TargetPrompt` field count < 2 | `UserWarning` (not fatal, but `mergeRoundsTotal` will always be 0) | + +### 7.2 `TargetPrompt` API Table + +Source code: `trpc_agent_sdk/evaluation/_target_prompt.py`. A container for registering multi-field prompts, supports both file source and callback source forms. 
+ +| Method | Signature | Behavior | +| --- | --- | --- | +| `add_path(name, path)` | `(str, str) -> Self` | Register file source field; `name` must be unique; returns self for chained calls | +| `add_callback(name, *, read, write)` | `(str, *, AsyncRead, AsyncWrite) -> Self` | Register callback source field; `read: async () -> str`, `write: async (str) -> None` must both be async; `name` must be unique | +| `names()` | `() -> list[str]` | Return field names (in registration order) | +| `describe_source(name)` | `(str) -> str` | File source returns path; callback source returns literal `""`; unknown name throws `KeyError` | +| `read(name)` | `async (str) -> str` | Read single field | +| `read_all()` | `async () -> dict[str, str]` | Read all fields (in registration order) | +| `write_all(prompts)` | `async (dict[str, str]) -> None` | **Atomically write all fields** (see contract below for details) | + +**Atomicity contract of `write_all`** (from source code comments): + +1. **File source atomic write**: First write to `.tmp`, then `os.replace` rename (POSIX guarantees rename atomicity) +2. **Failure rollback**: When any file write fails, already successfully written files roll back to pre-call content, clean up residual `.tmp`, original exception normally re-raised +3. **Rollback itself fails**: Original exception is preserved through `__context__`, and `_RollbackError` is raised listing each field's rollback failure details—rollback is best-effort, one field's failure does not skip subsequent ones +4. **Callback source does not rollback**: After file source writes successfully, then run callback sources in order; when callback source fails, file source rolls back to baseline, but **callback source itself does not rollback** (idempotency is caller's responsibility) + +**Key validation of `write_all`**: The key set of incoming `prompts` must **exactly equal** the registered field name set, otherwise throws `ValueError`. 
+ +### 7.3 `optimizer.json` Configuration Items Table + +Source code: `trpc_agent_sdk/evaluation/_optimize_config.py`. pydantic schema, **supports both camelCase and snake_case keys**. Top-level structure: + +```jsonc +{ + "evaluate": { ... }, // Evaluation section (same schema as AgentEvaluator) + "optimize": { // Optimizer section + "eval_case_parallelism": 4, + "stop": { ... }, // Framework-level stop + "algorithm": { ... } // Algorithm block (including reflection_lm) + } +} +``` + +#### 7.3.1 `evaluate` Section + +Source code: `_eval_config.py:EvalConfig`. + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `metrics` | **Conditionally Required** (see below) | `Optional[list[dict]]` | `None` | Metric array, each containing `metric_name` / `threshold` / `criterion`. **When `metrics` is configured, `criteria` is ignored** | +| `criteria` | **Conditionally Required** (see below) | `dict[str, Any]` | `{}` | Old-style shorthand: `metric_name → threshold` or `{threshold, criterion}` | +| `num_runs` | Optional | `int` | `1` | How many times to run each case and take mean (eliminate LLM output variance); `≄ 2` recommended | +| `user_simulator_config` | Optional | `Optional[Any]` | `None` | User simulator configuration (multi-turn scenarios; rarely used) | + +**Condition**: At **least 1** of `metrics` and `criteria` must be configured—when both are empty, `evaluate.get_eval_metrics()` returns empty list, and startup will report error due to no metrics. New integrations recommend using `metrics` (more structured), `criteria` is mainly kept for compatibility with old configurations. + +#### 7.3.2 `optimize` Section + +Source code: `_optimize_config.py:OptimizeConfig`. 
+ +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `eval_case_parallelism` | Optional | `int` | `4` | Case concurrency within same round (does not affect total call volume, affects instantaneous QPS) | +| `stop` | Optional | `FrameworkStopConfig` | `{required_metrics: "all"}` | Framework-level stop section (see [§7.3.5](#735-optimizestop-section) for details) | +| `algorithm` | **Required** | `GepaReflectiveAlgo` | — | Algorithm block (see [§7.3.3](#733-optimizealgorithm-section) for details) | + +#### 7.3.3 `optimize.algorithm` Section + +Source code: `_optimize_config.py:GepaReflectiveAlgo`. All adjustable parameters for the `gepa_reflective` algorithm. + +> **Hard constraint**: Among the **last 6 stopper fields** in the table, **at least 1 must be configured**—if all are left empty (default `None`), it will be rejected by `_require_at_least_one_stop_condition`, throwing `ValueError` fail-fast. This is why they are marked as "Conditionally Required". 
+ +**Basic fields**: + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `name` | **Required** | `Literal["gepa_reflective"]` | — | Algorithm selector; currently the only supported value | +| `reflection_lm` | **Required** | `OptimizeModelOptions` | — | Reflection LM configuration (see [§7.3.4](#734-optimizealgorithmreflection_lm-section) for details) | +| `seed` | Optional | `int` | `42` | Random seed; keep it identical across the two configurations when A/B testing | + +**Search behavior fields**: + +| Field | Required | Type | Default | Values and Description | +| --- | --- | --- | --- | --- | +| `candidate_selection_strategy` | Optional | Literal | `"pareto"` | `pareto` select from frontier (default recommended) / `current_best` use current best / `epsilon_greedy` exploration-exploitation / `top_k_pareto` random from top K of frontier | +| `module_selector` | Optional | `str` | `"round_robin"` | Which field to modify this round in multi-field: `round_robin` rotate in registration order / `all` select all / `random` random | +| `frontier_type` | Optional | Literal | `"instance"` | Pareto frontier granularity: `instance` one best per case / `objective` one per metric / `hybrid` two-layer / `cartesian` Cartesian product | +| `reflection_minibatch_size` | Optional | `Optional[int]` | `None` | Minibatch size for each round's reflection; `None` lets gepa decide | +| `reflection_history_top_k` | Optional | `int` (0~5) | `2` | How many historical best responses to give reflection LM for each case; 0 disables, upper limit 5 | +| `perfect_score` | Optional | `float` | `1.0` | "Perfect score" threshold (used with `skip_perfect_score`) | +| `skip_perfect_score` | Optional | `bool` | `True` | Skip cases that already have perfect score during reflection | + +**Multi-field fusion (merge) fields**: + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `use_merge` | Optional | `bool` | `False` | Enable 
merge round; **only actually triggers in multi-field (≄2)**, never triggers in single-field and won't report error (only `UserWarning`) | +| `max_merge_invocations` | Optional | `int` | `5` | Upper limit on merge trigger count | +| `merge_val_overlap_floor` | Optional | `int` | `5` | Minimum val set case overlap count to trigger merge | + +**Performance fields**: + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `cache_evaluation` | Optional | `bool` | `False` | Cache (candidate, case) scores; skip directly on repeated evaluation | +| `track_best_outputs` | Optional | `bool` | `False` | Track best output for each case | + +**6 stop condition items**—**configure at least 1** (OR semantics trigger): + +| Field | Required | Type | Default | Trigger Condition | +| --- | --- | --- | --- | --- | +| `max_metric_calls` | Conditionally Required | `Optional[int]` | `None` | Cumulative case-level evaluation count ≄ N → stop | +| `max_iterations_without_improvement` | Conditionally Required | `Optional[int]` | `None` | N consecutive rounds without best valset improvement → stop | +| `timeout_seconds` | Conditionally Required | `Optional[float]` | `None` | Wall-clock exceeds N seconds → stop | +| `score_threshold` | Conditionally Required | `Optional[float]` | `None` | Best valset score ≄ N → stop | +| `max_candidate_proposals` | Conditionally Required | `Optional[int]` | `None` | Candidate proposal count ≄ N → stop | +| `max_tracked_candidates` | Conditionally Required | `Optional[int]` | `None` | Pareto candidate pool size ≄ N → stop | + +**Condition**: At least 1 of the 6 items must be non-`None`, otherwise fail-fast at startup. See [§4.7 SLO Hard Constraints](#47) for details. + +#### 7.3.4 `optimize.algorithm.reflection_lm` Section + +Source code: `_optimize_model_options.py:OptimizeModelOptions`. Reflection LM connection configuration. 
+ +> **Only need to configure 4 in daily use**: `model_name` / `base_url` / `api_key` / `generation_config` (leave others as default). The 6 items marked "advanced" in the table below generally do not need to be touched. + +| Field | Required | Type | Default | Description | +| --- | --- | --- | --- | --- | +| `model_name` | **Required** | `str` | `""` | Model name (e.g., `"gpt-4o-mini"`); empty string equals not configured, will report error at startup | +| `base_url` | Optional | `Optional[str]` | `None` | Custom endpoint URL | +| `api_key` | Optional | `str` | `""` | API key (most providers must provide, otherwise will report error at call stage) | +| `generation_config` | Optional | `Optional[dict]` | `None` | Generation parameters; typical: `{"max_tokens": 4096, "temperature": 0.6}` | +| `provider_name` | Advanced | `str` | `""` | Provider name; empty / `"openai"` goes to `OpenAIModel`, other values go to `ModelRegistry.create_model("{provider}/{model}")` | +| `variant` | Advanced | `str` | `""` | OpenAI-compatible variant (only when provider is openai) | +| `extra_fields` | Advanced | `Optional[dict]` | `None` | Extra fields transparently passed to underlying model | +| `num_samples` | Advanced | `Optional[int]` | `None` | Number of samples | +| `weight` | Advanced | `float` | `1.0` | Weight (multi-judge scenarios) | +| `think` | Advanced | `Optional[bool]` | `None` | Whether to enable thinking mode | + +**Field values support environment variable expansion**—`"${TRPC_AGENT_API_KEY}"` will be automatically replaced. + +#### 7.3.5 `optimize.stop` Section + +Source code: `_optimize_config.py:FrameworkStopConfig`. 
+ +| Field | Required | Type | Default | Values | +| --- | --- | --- | --- | --- | +| `required_metrics` | Optional | `Optional[Union[Literal["all"], list[str]]]` | `"all"` | `"all"`: all metrics must reach threshold; `["m1", "m2"]`: listed metrics must reach threshold (other metrics still participate in evaluation but do not affect early stop); `null` or `[]`: disable framework-level early stop (rely only on algorithm-level stoppers) | + +**List form validation**: Metric names in the list must be findable in `evaluate.metrics[]`, otherwise `OptimizeConfigFile._validate_required_metrics_against_evaluate` throws `ValueError` at startup, error message lists "unknown metrics" and "available metrics" checklist. + +### 7.4 `OptimizeResult` + `RoundRecord` Field Table + +Source code: `trpc_agent_sdk/evaluation/_optimize_result.py`. This is the return value of `optimize()`, and also the content of `runs//result.json`. + +> **Important convention**: Both `OptimizeResult` and `RoundRecord` are based on `EvalBaseModel` (`alias_generator=to_camel`). **Python in-memory uses snake_case, all converted to camelCase when serialized to JSON**—use camelCase when indexing `result.json` (`bestPassRate` not `best_pass_rate`), common pitfall. In the table below, the "Field" column uses Python names (snake_case), switch to camelCase when reading JSON. 
+ +#### 7.4.1 `OptimizeResult` Top-Level Fields + +**Core result fields**: + +| Field (snake_case) | Type | Meaning | +| --- | --- | --- | +| `status` | `Literal["SUCCEEDED", "FAILED", "CANCELED"]` | Final status; when `FAILED`, `best_prompts = baseline_prompts` | +| `finish_reason` | Literal | `completed` / `perfect_pass_rate` / `no_improvement` / `error` | +| `stop_reason` | `Optional[StopReason]` | Which stopper triggered (see [§5.4](#54-stop-timing-complete-current-round-before-stopping) for details); `None` when FAILED early stop | +| `error_message` | `str` | Error message when FAILED (default `""`) | +| `algorithm` | `str` | Algorithm name (e.g., `"gepa_reflective"`) | + +**Score fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `baseline_pass_rate` | `float` | Pass rate of baseline on valset | +| `best_pass_rate` | `float` | Pass rate of optimal candidate on valset | +| `pass_rate_improvement` | `float` | `best - baseline` | +| `baseline_metric_breakdown` | `dict[str, float]` | Mean score of each metric for baseline | +| `best_metric_breakdown` | `dict[str, float]` | Mean score of each metric for optimal candidate | +| `metric_thresholds` | `dict[str, float]` | Threshold for each metric (copied from `evaluate.metrics[].threshold`) | +| `per_metric_best_candidates` | `dict[str, list[int]]` | Pareto frontier candidate index for each metric (0-based); empty = algorithm does not expose this information | + +**Prompt fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `baseline_prompts` | `dict[str, str]` | Starting prompt content (keyed by TargetPrompt field names) | +| `best_prompts` | `dict[str, str]` | Optimal candidate prompts; = `baseline_prompts` when `FAILED` (ensuring artifacts **will never be worse than baseline**) | + +**Round fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `total_rounds` | `int` | How many rounds were run | +| `rounds` | `list[RoundRecord]` | Each round's record (see §7.4.2 for details) | + 
+**Statistics and time fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `total_reflection_lm_calls` | `int` | Cumulative reflection LM call count (including retries) | +| `total_token_usage` | `dict[str, int]` | Cumulative tokens for reflection LM: `{prompt, completion, total}` | +| `duration_seconds` | `float` | Total wall-clock duration | +| `started_at` / `finished_at` | `str` | ISO-8601 timestamps | + +**Others**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `schema_version` | `str` | Default `"v1"`; bump when artifact schema upgrades | +| `extras` | `dict[str, Any]` | Custom business fields; optimizer does not read or write | + +#### 7.4.2 `RoundRecord` Fields (One Per Round) + +**Basic round information**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `round` | `int` | 1-based round number | +| `kind` | `Literal["reflective", "merge"]` | Reflection round / fusion round | +| `started_at` | `str` | ISO-8601 timestamp | +| `duration_seconds` | `float` | Wall-clock duration of this round | + +**Rewrite situation**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `optimized_field_names` | `list[str]` | Field names rewritten by reflection LM in this round | +| `candidate_prompts` | `dict[str, str]` | Full field content of this round's candidate | +| `accepted` | `bool` | Whether accepted as new best | +| `acceptance_reason` | `str` | Human-readable explanation of acceptance decision | +| `per_field_diagnosis` | `dict[str, str]` | Diagnosis text given by reflection LM for each field | + +**Scoring situation**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `validation_pass_rate` | `float` | Pass rate of this round on valset | +| `metric_breakdown` | `dict[str, float]` | Mean score of each metric on valset this round; empty = this round did not run valset | +| `failed_case_ids` | `list[str]` | Failed case IDs on valset this round | +| `failed_cases_truncated` | `int` | Number of failed cases cut off due to token budget | 
+| `train_minibatch_size` | `int` | Minibatch size of this round; 0 = skip, not sampled | +| `train_subsample_parent_score` | `Optional[float]` | Parent candidate's score on minibatch; `None` = not run | +| `train_subsample_candidate_score` | `Optional[float]` | New candidate's score on minibatch; `None` = not run | +| `skip_reason` | `Optional[str]` | Skip reason (e.g., `"subsample perfect"`, `"no proposal"`) | +| `error_message` | `Optional[str]` | Algorithm error message this round | + +**Statistical fields**: + +| Field | Type | Meaning | +| --- | --- | --- | +| `reflection_lm_calls` | `int` | Reflection LM call count this round (including retries) | +| `round_token_usage` | `dict[str, int]` | Reflection LM tokens this round: `{prompt, completion, total}` | +| `budget_used` | `Optional[int]` | Cumulative used metric_calls | +| `budget_total` | `Optional[int]` | Configured budget upper limit (e.g., `max_metric_calls`) | + +**`extras`** (`dict[str, Any]`): Custom business fields; optimizer does not read or write. + +#### 7.4.3 `OptimizeResult` Utility Methods + +| Method | Behavior | +| --- | --- | +| `dump_to(path)` | Serialize to JSON file (`indent=2`, `by_alias=True`) | +| `OptimizeResult.from_file(path)` | classmethod, deserialize from JSON | +| `format_summary(*, output_dir, update_source)` | Generate human-readable text for `summary.txt` | + + +## 8 Artifacts and Directory Conventions + +Each time `optimize()` is run, the framework persists a complete set of audit artifacts under `output_dir`. All writes are **atomic**—SIGINT / process crash will not leave half-written files. 
+ +### 8.1 Directory Layout + +```text +runs/<timestamp>/ +ā”œā”€ā”€ result.json Complete OptimizeResult serialization (programmatic entry) +ā”œā”€ā”€ summary.txt Human-readable summary (see baseline → best at a glance) +ā”œā”€ā”€ config.snapshot.json Complete snapshot of optimizer.json used this run (reproducible) +ā”œā”€ā”€ run.log Single-line status, CI parsing friendly +│ +ā”œā”€ā”€ baseline_prompts/ Prompt snapshots before running (one .md per field) +│ ā”œā”€ā”€ system_prompt.md +│ └── ... +│ +ā”œā”€ā”€ best_prompts/ Optimal candidate from optimization (one .md per field) +│ ā”œā”€ā”€ system_prompt.md +│ └── ... +│ +└── rounds/ Complete RoundRecord for each round + ā”œā”€ā”€ round_001.json + ā”œā”€ā”€ round_002.json + └── ... +``` + +Role of each file: + +| File / Directory | When Written | What It's For | +| --- | --- | --- | +| `result.json` | Optimization ends (including failure) | Most authoritative artifact for programmatic reading. Complete `OptimizeResult` serialization (see [§7.4](#74-optimizeresult--roundrecord-field-table) for details). **Field names are camelCase** | +| `summary.txt` | Optimization ends (only success) | Human-readable summary: `baseline → best` trend, metric breakdown, all best fields + character count, artifact directory index | +| `config.snapshot.json` | Optimization starts | Complete snapshot of `optimizer.json` used this run—reuse it directly later when you want to "re-run this result" | +| `run.log` | Optimization ends | Single line: ` status=... algorithm=... 
baseline=0.4 best=0.85 delta=+0.45 rounds=10 duration_seconds=120.5`; CI platform grep-friendly | +| `baseline_prompts/<field>.md` | Optimization starts | Content snapshot of each TargetPrompt field before running—**written regardless of `update_source` setting** (most important fallback artifact) | +| `best_prompts/<field>.md` | Optimization ends (only when result exists) | Optimal candidate prompts—when `update_source=False`, this is the most valuable artifact (awaiting manual review and synchronization) | +| `rounds/round_<NNN>.json` | Each round ends | Complete `RoundRecord` serialization (see [§7.4.2](#742-roundrecord-fields-one-per-round) for details); 3-digit zero-padded numbering for easy sorting | + +### 8.2 Sentinel File: Letting Users Actively Stop Optimization + +Source code: the end of `_optimize_gepa_reflective.py:_build_stop_callbacks`. + +During optimization, manually create an `optimize.stop` file under `output_dir`: + +```bash +touch runs/<timestamp>/optimize.stop +``` + +The framework detects this file at the beginning of the next round and stops (implemented by `gepa.utils.FileStopper`), with `stop_reason="user_requested_stop"`. **Typical use case**: halfway through a run you find the result is already good enough, or you temporarily need to release LLM quota—this is more graceful than Ctrl+C, because the current round completes and the artifacts on disk stay clean. + +### 8.3 Atomic Disk Persistence Guarantee + +**All artifacts use tmp + `os.replace` atomic write**—POSIX guarantees rename atomicity, so when the process is killed or power fails, `output_dir` contains either the intact old file or the intact new file, and **never a half-written one**. 
+ +Source code: Two utility functions in `_agent_optimizer.py`: + +- `_atomic_write_text(path, content)`: First write to `.tmp`, then `os.replace(tmp, path)` +- `_mask_sigint`: Context manager, shields SIGINT during `_persist_artifacts` (avoid "second Ctrl+C interrupts finally disk persistence") + +**Source prompt file write-back when `update_source=True`**: Uses `TargetPrompt.write_all`, also guarantees atomicity for **multi-field**—when any field write fails, all already successfully written fields roll back to pre-call content (see `write_all` contract in [§7.2](#72-targetprompt-api-table) for details). + +> **Extreme fault tolerance**: If `os.replace` itself fails when `update_source=True` writes source files (e.g., target file's directory was concurrently deleted), the framework will **explicitly call `write_all(baseline)` to restore source files to pre-run content**, then re-raise the original exception—ensuring business never gets a "half-optimized" source file. + + +## 9 Want to Extend Yourself? + +Source code main entry: `_optimize_registrations.py`. The framework supports three types of extensions through a **registration mechanism**, no need to fork the SDK. + +### 9.1 Register New Algorithm + +Source code: `_base_optimizer.py:BaseOptimizer` + `_optimize_registry.py:OPTIMIZER_REGISTRY`. + +Write a `BaseOptimizer` subclass, implement `async def run(self, *, reporter=None) -> OptimizeResult`, register to `OPTIMIZER_REGISTRY`: + +```python +from trpc_agent_sdk.evaluation._base_optimizer import BaseOptimizer +from trpc_agent_sdk.evaluation._optimize_registry import OPTIMIZER_REGISTRY +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult + + +class MyOwnOptimizer(BaseOptimizer): + async def run(self, *, reporter=None) -> OptimizeResult: + # Your algorithm main loop. 
Base class has already injected: + # self.config - OptimizeConfigFile (including evaluate / optimize two sections) + # self.call_agent - Business agent adapter function + # self.target_prompt - TargetPrompt instance + # self.train_dataset_path / self.validation_dataset_path + # self.callbacks / self.output_dir + # self.extra_stop_callbacks / self.extra_gepa_callbacks + ... + return OptimizeResult(...) + + +# Registration: second parameter must be BaseOptimizer subclass, otherwise register() throws TypeError +OPTIMIZER_REGISTRY.register("my_own_algo", MyOwnOptimizer) +``` + +Business side usage: Change `optimize.algorithm.name` in `optimizer.json` to `"my_own_algo"`, the framework finds your class through `OPTIMIZER_REGISTRY.get(...)` at startup, instantiates it, and runs `run()`. + +**Note**: `GepaReflectiveAlgo.name` is currently `Literal["gepa_reflective"]`—**new algorithms need a new `pydantic.BaseModel` configuration class** (e.g., `MyOwnAlgo`), and modify `OptimizeConfig.algorithm` field to discriminated union (see `_optimize_config.py:OptimizeConfig` docstring for details). + +### 9.2 Register Custom Stopper + +Source code: `AgentOptimizer.optimize`'s `extra_stop_callbacks` parameter in `_agent_optimizer.py`. 
+ +Inject via `extra_stop_callbacks` at runtime—**no need to modify configuration file**: + +```python +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _LabeledStopper + + +class MySloMonitorStopper: + """Custom stopper: check external SLO monitoring system, stop when threshold is exceeded.""" + + def __init__(self, slo_client): + self._slo = slo_client + self.last_triggered = False + + def __call__(self, gepa_state=None) -> bool: + if self._slo.is_p99_breached(): + self.last_triggered = True + return True + return False + + +# Usage: +stopper = MySloMonitorStopper(slo_client) +result = await AgentOptimizer.optimize( + ..., + extra_stop_callbacks=[ + # Ordinary stopper: stop_reason displays as "completed" + stopper, + + # When wanting stable stop_reason label, use _LabeledStopper wrapper: + # _LabeledStopper(stopper, "slo_breach"), # But "slo_breach" is not in StopReason Literal, pydantic will reject + ], +) +``` + +**Interface contract** (see `_LabeledStopper`): + +- Must have `__call__(self, gepa_state=None) -> bool` method +- `True` means stop +- Should have `last_triggered: bool` attribute for `_classify_stop_reason` to read + +**Two behaviors of `stop_reason`**: + +- Ordinary callable / custom class: `stop_reason` displays as `"completed"` when triggered (gepa doesn't know why you stopped) +- Wrapped with `_LabeledStopper(inner, label)`: `label` must be a legal value of `StopReason` Literal (see `_optimize_result.py`); need to extend Literal type when customizing new label + +### 9.3 Register Custom Evaluation Callback + +Source code: `AgentOptimizer.optimize`'s `extra_gepa_callbacks` parameter in `_agent_optimizer.py`. + +Access gepa internal events through `extra_gepa_callbacks`—typical use: forwarding to dashboard / real-time monitoring metrics. + +```python +class MyDashboardCallback: + def on_proposal_end(self, *args, **kwargs) -> None: + # Report to Grafana / WandB / internal monitoring + ... 
+ + # gepa silently ignores missing methods, just implement part of the protocol methods as needed + + +result = await AgentOptimizer.optimize( + ..., + extra_gepa_callbacks=[MyDashboardCallback()], +) +``` + +**Protocol constraints**: Each callback should implement several methods in `gepa.core.callback.GEPACallback` protocol (`on_iteration_start` / `on_proposal_start` / `on_proposal_end` / `on_valset_breakdown` / ...). **gepa silently ignores missing methods in callback**, so business code only needs to implement the few methods it cares about. + + +## 10 FAQ + +**Q: Ran once, `bestPassRate` in `result.json` is the same as `baselinePassRate`, `accepted` are all false—is it a bug?** + +Not a bug. Optimization didn't find a candidate better than baseline—`status="SUCCEEDED"` + `finish_reason="no_improvement"` is the typical combination for this situation, `best_prompts` equals `baseline_prompts`. Possible reasons: baseline is already very good, `max_metric_calls` is too small to reach improvement point, training set and validation set have very different distributions, metric noise is too large (recommend increasing `num_runs`). + +--- + +**Q: `update_source=True` crashed during run, were source prompt files corrupted?** + +No. Two layers of protection: (1) When optimization fails (`status="FAILED"`), the framework skips the final write-back of `best_prompts` to the source files; (2) Even if `write_all` itself fails, source files are atomically rolled back through tmp + `os.replace` (see [§8.3](#83-atomic-disk-persistence-guarantee) for details). + +--- + +**Q: Can I modify `optimizer.json` mid-run?** + +No. `optimizer.json` is loaded once at startup, subsequent modifications will not be read. Sentinel file `optimize.stop` is the only supported "runtime intervention" (see [§8.2](#82-sentinel-file-letting-users-actively-stop-optimization) for details). 
+ +--- + +**Q: Can I run with a very small training set (< 5 cases)?** + +Yes, but effect is poor: (1) Reflection LM sees too few feedback samples, rewrite direction is unstable; (2) Small training set easily lets advanced configuration overfit (refer to [§4.8](#48)). Recommend at least 5~10 cases; consider manual tuning first when < 5. + +--- + +**Q: How to handle retries when `call_agent` internally sends HTTP / RPC?** + +Handle it yourself within `call_agent`. The framework does not do retries for business at LLM / service call layer—designed to keep `call_agent` as a black box. If the call fails, that case's evaluation score counts as 0, and the reflection LM will see the error message (refer to §5.2 Reflection LM feedback structure). + +--- + +**Q: Can multiple `optimize()` runs happen simultaneously, sharing one `output_dir`?** + +No. Multiple processes writing to one `output_dir`, atomic write constraint protects single files from being half-written, but **multiple processes overwrite files mutually**—`result.json` / `rounds/round_001.json`, etc. will step on each other. Use independent timestamp subdirectory for each run. + +--- + +**Q: When using black-box `call_agent` mode, can I use metrics like `tool_trajectory_avg_score`?** + +No. Black-box `call_agent` mode cannot obtain session traces / tool intermediate_data, the framework will fail-fast and reject at startup (see [§7.1](#71-agentoptimizeroptimize-parameter-table) startup check table for details). Switch to response-level metrics: `final_response_avg_score` / `llm_rubric_response` / `llm_final_response`. + +--- + +**Q: After running with `update_source=False`, source prompts are still in place, but `target_prompt.write_all` was called repeatedly during the process?** + +Yes. The optimizer main loop calls `write_all` every time a new candidate is generated to write the candidate to source files registered with `add_path`—this is to let the next `call_agent` call read the new prompt. 
**The `finally` phase will automatically `write_all(baseline_snapshot)` to roll back source files to baseline content** (source code: `cleanup_done` sentinel in `optimize` in `_agent_optimizer.py`). So after `update_source=False` finishes running, source files are **completely consistent with before running**—provided that `TargetPrompt.write_all` didn't throw an error during the rollback phase (in extreme cases when it throws an error, the framework will log a warning but will not affect `result.json` / `best_prompts/` artifact production). + +--- + +**Q: How to "re-run" last optimization result?** + +Re-run `runs//config.snapshot.json`—it is the complete configuration snapshot from last time. But LLM output has randomness, even with consistent configuration you may get different best_prompts; fixing the `seed` field can reduce (not eliminate) this randomness. Must lock seed when A/B testing (refer to [§4.8](#48)). diff --git a/docs/mkdocs/zh/optimization.md b/docs/mkdocs/zh/optimization.md new file mode 100644 index 0000000..2a24e1e --- /dev/null +++ b/docs/mkdocs/zh/optimization.md @@ -0,0 +1,2038 @@ +# Prompt č‡Ŗä¼˜åŒ–ļ¼ˆAgentOptimizer) + +`AgentOptimizer` 是 tRPC-Agent-Python ēš„ **prompt č‡Ŗä¼˜åŒ–ęØ”å—**:它把 prompt å·„ēØ‹ēš„čæ­ä»£čæ‡ēØ‹ā€”ā€”å¤±č“„ę”ˆä¾‹åˆ†ęžć€ę”¹å†™ć€å›žå½’éŖŒčÆć€ē‰ˆęœ¬ē®”ē†ā€”ā€”ę•“ä½“ę‰˜ē®”äøŗäø€ę”åÆå¤ēŽ°ēš„č‡ŖåŠØåŒ–ęµę°“ēŗæļ¼ŒęŠŠå·„ēØ‹åøˆä»Žę‰‹å·„čÆ•é”™äø­č§£ę”¾å‡ŗę„ć€‚ + +> **čæ™é‡Œ "prompt" ēš„å¤–å»¶**:在 agent åŗ”ē”Øäø­ļ¼Œprompt äøä»…ęŒ‡ē‹­ä¹‰ēš„ system promptļ¼Œčæ˜åŒ…ę‹¬ę‰€ęœ‰ä»„č‡Ŗē„¶čÆ­čØ€é©±åŠØ agent č”Œäøŗēš„ę–‡ęœ¬čµ„äŗ§ā€”ā€”skill ęčæ°ć€rule 规范态sub-agent ååŒęŒ‡ä»¤ć€å·„å…·ä½æē”ØčÆ“ę˜Žē­‰ć€‚å®ƒä»¬ēš„ęœ¬č“Øéƒ½ę˜Æč¢« LLM č§£čÆ»ēš„č‡Ŗē„¶čÆ­čØ€ę–‡ęœ¬ļ¼›åŖč¦ä¼šå·¦å³ agent å†³ē­–ļ¼Œéƒ½åÆä»„ä½œäøŗ `AgentOptimizer` ēš„ä¼˜åŒ–ē›®ę ‡ć€‚ + +ęØ”å—ē”±å››äøŖå­ęØ”å—ē»„ęˆļ¼ŒåÆ¹å¤–é€ščæ‡å•äø€å…„å£ `AgentOptimizer.optimize` 驱动: + +| å­ęØ”å— | 职蓣 | +|---|---| +| **ä¼˜åŒ–ē®—ę³•** | 
åę€ā€”čÆ„ä¼°ā€”äæē•™å¾ŖēŽÆļ¼›å½“å‰å†…ē½® [GEPA](https://github.com/gepa-ai/gepa)(Genetic-Evolutionary Pareto,MIT Licenseļ¼‰ļ¼Œé€ščæ‡ `OPTIMIZER_REGISTRY` åÆę‰©å±•ęŽ„å…„å…¶ä»–ē®—ę³• | +| **čÆ„ęµ‹ę”„ęŽ„** | å¤ē”Ø `AgentEvaluator`ļ¼Œč®©ä¼˜åŒ–čæ‡ēØ‹äøŽę—„åøøå›žå½’å…±ē”ØåŒäø€ä»½ `EvalSet` äøŽ metric é…ē½® | +| **Prompt 箔理** | `TargetPrompt` ē»Ÿäø€ęŠ½č±” prompt å­—ę®µēš„čÆ»å†™ļ¼›ę”ÆęŒęœ¬åœ°ę–‡ä»¶ļ¼ˆpathļ¼‰äøŽä»»ę„åŽē«Æļ¼ˆcallbackļ¼‰äø¤ē§ęŗ | +| **čæč”Œē¼–ęŽ’** | čµ„ęŗč°ƒåŗ¦ć€stopperļ¼ˆåœęœŗå™Øļ¼‰ć€äŗ§ē‰©åŽŸå­č½ē›˜ć€SIGINT äæ”å·å®‰å…Ø | + +`AgentOptimizer` 把"prompt 调优"é‡ę–°å®šä¹‰ęˆäø€äøŖ**ęœ‰č¾¹ē•Œć€åÆå¤ēŽ°ć€åÆå®”č®”**ēš„å·„ēØ‹é—®é¢˜ļ¼š + +| 结度 | č”Øč¾¾ę–¹å¼ | +|---|---| +| ä¼˜åŒ–ē›®ę ‡ | `evaluate.metrics[]` —— ę•°å€¼åŒ–ć€åÆé‡å¤čÆ„ä¼°ēš„ęŒ‡ę ‡é›†åˆ | +| å†³ē­–å˜é‡ | `TargetPrompt` ę³Øå†Œēš„ prompt å­—ę®µļ¼ˆäø€äøŖęˆ–å¤šäøŖļ¼‰ | +| ęœē“¢čæ‡ēØ‹ | reflection LMļ¼ˆåę€åž‹ LLMļ¼‰é©±åŠØēš„åę€ā€”čÆ„ä¼°ā€”äæē•™å¾ŖēŽÆļ¼ˆčÆ¦č§ [§5](#5-gepa-ę˜Æę€Žä¹ˆå·„ä½œēš„)) | +| ē»ˆę­¢ę”ä»¶ | 6 ē§å†…ē½® stopper + ē”Øęˆ·č‡Ŗå®šä¹‰åœęœŗå™Øļ¼ˆčÆ¦č§ [§4.7](#47)) | +| 产物 | `OptimizeResult` 对豔 + `runs/<ę—¶é—“ęˆ³>/` å…Øé‡å®”č®”ē›®å½•ļ¼ˆčÆ¦č§ [§8](#8-äŗ§ē‰©äøŽē›®å½•ēŗ¦å®š)) | + +> **å‰ē½®é˜…čÆ»**:[Agent 评测](evaluation.md) —— ä¼˜åŒ–å»ŗē«‹åœØčÆ„ęµ‹ä¹‹äøŠļ¼›ęœ¬ę–‡å‡č®¾čÆ»č€…å·²äŗ†č§£ `EvalSet` äøŽ `metric` ēš„åŸŗęœ¬ę¦‚åæµć€‚ + +--- + +## 1 čæ™ę˜Æä»€ä¹ˆ / č§£å†³ä»€ä¹ˆé—®é¢˜ + +### 1.1 č§£å†³ēš„é—®é¢˜ + +在 agent åŗ”ē”Øčæ›å…„äøšåŠ”å…³é”®é“¾č·ÆåŽļ¼Œprompt(含 skill态rule ē­‰ę‰€ęœ‰é©±åŠØ agent č”Œäøŗēš„č‡Ŗē„¶čÆ­čØ€ę–‡ęœ¬ļ¼‰ę˜Æčæ­ä»£ęˆęœ¬ęœ€é«˜ēš„čµ„äŗ§ä¹‹äø€ļ¼šę‰‹å·„č°ƒä¼˜ä¾čµ–å·„ēØ‹åøˆåÆ¹å¤±č“„ę”ˆä¾‹ēš„å½’ēŗ³čƒ½åŠ›ļ¼Œč§„ęØ”åŒ–åŽå›žå½’é£Žé™©åæ«é€Ÿę”¾å¤§ļ¼›å¤š sub-agent é“¾č·ÆäøŠ prompt å­—ę®µä¹‹é—“ēš„č€¦åˆč®©å•å­—ę®µä¼˜åŒ–å¤±åŽ»ę„ä¹‰ļ¼›ęØ”åž‹å‡ēŗ§ć€å·„å…·å˜ę›“ć€åœŗę™Æę‰©å¼ éƒ½ä¼šč®©"ę˜Øę—„ęœ€ä¼˜"ēš„ prompt åœØä»Šę—„å¤±ę•ˆć€‚ + +`AgentOptimizer` ęØ”å—ęŠŠčæ™å„—čæ­ä»£čæ‡ēØ‹**å®Œę•“åœ°å·„ēØ‹åŒ–**: + +- **ä¼˜åŒ–ē›®ę ‡ę˜¾å¼åŒ–**ā€”ā€”ęŠŠ"ä»€ä¹ˆē®—å„½"沉淀为 
metric + threshold ēš„ę•°å€¼å„‘ēŗ¦ļ¼ŒåÆč¢«čÆ„ęµ‹ć€ä¼˜åŒ–ć€CI/CD 共享 +- **ęœē“¢čæ‡ēØ‹ē®—ę³•åŒ–**ā€”ā€”åę€ā€”čÆ„ä¼°ā€”äæē•™å¾ŖēŽÆę›æä»£äŗŗå·„čÆ•é”™ļ¼Œčæ‡ēØ‹åÆé‡ę”¾ć€ē»“ęžœåÆęÆ”č¾ƒ +- **多 prompt č”åˆä¼˜åŒ–**ā€”ā€”ę”ÆęŒåŒę—¶ä¼˜åŒ–å¤šäøŖå­—ę®µļ¼ˆå¦‚ router + worker + summarizer ēš„ęŒ‡ä»¤ć€CLAUDE.md + SKILL.mdļ¼‰ļ¼Œå¹¶é€ščæ‡ GEPA ēš„ merge ęœŗåˆ¶åšč·Øå­—ę®µęœē“¢ +- **čæč”Œčæ‡ēØ‹åÆå®”č®”**ā€”ā€”ęÆč½® reflection č¾“å…„ć€å€™é€‰ę”¹åŠØć€čÆ„ä¼°åˆ†ę•°ć€ęŽ„å—/ę‹’ē»åŽŸå› éƒ½č½ē›˜åˆ° `runs/<ę—¶é—“ęˆ³>/`ļ¼Œę”ÆęŒäŗ‹åŽčæ½ęŗÆ +- **ē»“ęžœåÆęŽ§åÆå›žę»š**——`update_source` å†³å®šę˜Æå¦å›žå†™ęŗ promptļ¼›`TargetPrompt` ęä¾›åŽŸå­å†™å…„äøŽå¤±č“„å›žę»šļ¼Œå†™ē›˜åŠäø­ę–­ęˆ– SIGINT äŗŒę¬”äø­ę–­éƒ½äøä¼šęŸåęŗę–‡ä»¶ + +### 1.2 äøŽčÆ„ęµ‹ęØ”å—ēš„å…³ē³» + +`AgentEvaluator` äøŽ `AgentOptimizer` Ꞅꈐ**čÆ„ęµ‹ā€”ä¼˜åŒ–é—­ēŽÆ**ēš„äø¤ē«Æļ¼š + +| ęØ”å— | 角色 | 输出 | +|---|---|---| +| `AgentEvaluator`([evaluation.md](evaluation.md)) | åŗ¦é‡å½“å‰ prompt ēš„č“Øé‡ | ęÆę” case ēš„ pass/fail + 各 metric 分 | +| `AgentOptimizer`ļ¼ˆęœ¬ę–‡ļ¼‰ | åœØåŗ¦é‡ē»“ęžœäøŠęœē“¢ę›“ä¼˜ prompt | ęœ€ä¼˜ prompt + å…ØēØ‹ä¼˜åŒ–åŽ†å² | + +äŗŒč€…å…±äŗ«åŒäø€ä»½ `EvalSet`ć€åŒäø€å„— metric é…ē½®ć€åŒäø€äøŖ `call_agent`ć€‚äø€ä»½čµ„äŗ§åŒę—¶ę”Æę’‘ę—„åøøå›žå½’ļ¼ˆpytest č·‘ `AgentEvaluator`ļ¼‰äøŽå®šęœŸä¼˜åŒ–ļ¼ˆå¤œé—“ēŖ—å£č·‘ `AgentOptimizer`,详见 [§4.6 CI é—­ēŽÆ](#46))。 + +### 1.3 é€‚ē”Øč¾¹ē•Œ + +`AgentOptimizer` ēš„ęœ‰ę•ˆę€§å–å†³äŗŽäø‰äøŖå‰ęļ¼š + +1. **čÆ„ęµ‹äæ”å·č¶³å¤ŸēØ³å®š**ć€‚åˆ¤åˆ†ęœ¬čŗ«ēš„ę–¹å·®å¤§äŗŽ prompt ę”¹å†™åø¦ę„ēš„ęå‡ę—¶ļ¼Œä¼˜åŒ–ę–¹å‘äøåÆäæ”ć€‚å»ŗč®®å…ˆåœØ `AgentEvaluator` äøŠč·‘ `num_runs=3` č§‚åÆŸ metric č·Øę¬”äø€č‡“ę€§ļ¼Œå†å¼€å§‹ä¼˜åŒ–ć€‚ +2. 
**é¢„ē®—äøŽęœē“¢ē©ŗé—“åŒ¹é…**ć€‚äø€ę¬”å…øåž‹ēš„å°č§„ęØ”ä¼˜åŒ–åœØ `max_metric_calls=30~60`ļ¼ˆäø€ę¬” case-level 评估算一欔 metric_callļ¼‰é‡ēŗ§ć€reflection LM č°ƒē”Ø 5~20 ę¬”ć€čæč”Œ 1~10 åˆ†é’Ÿć€ę¶ˆč€—å‡ ē¾Žå…ƒåˆ°å‡ åē¾Žå…ƒļ¼ˆčÆ¦č§ [§6 ęˆęœ¬äøŽå¹¶å‘](#6-ęˆęœ¬äøŽå¹¶å‘)ļ¼‰ć€‚é¢„ē®—ę˜¾č‘—ä½ŽäŗŽčÆ„é‡ēŗ§ę—¶ļ¼Œåŗ”å…ˆåœØ `AgentEvaluator` äøŠå®ŒęˆåŸŗēŗæč°ƒä¼˜ć€‚ +3. **prompt ęœ‰åÆä¼˜åŒ–ēš„čÆ­ä¹‰ē»“ęž„**ć€‚å°‘äŗŽ 20 å­—ēš„ē”¬ē¼–ē ęˆ–ä»…ä½œå ä½ę‹¼ęŽ„ēš„ promptļ¼Œęœē“¢ē©ŗé—“čæ‡ēŖ„ļ¼›GEPA åę€åœØčæ™ē§åœŗę™Æäø‹é€€åŒ–äøŗåŒä¹‰ę”¹å†™ć€‚ + +äøåœØä»„äøŠå‰ęå†…ēš„åœŗę™Æļ¼Œåŗ”ä¼˜å…ˆé€‰ę‹© [`AgentEvaluator`](evaluation.md) ęŒē»­č§‚åÆŸļ¼Œč€ŒéžåÆåŠØä¼˜åŒ–ć€‚ + + + + + +## 2 5 分钟 Quickstart + +å®Œę•“ä»£ē äøŽę•°ę®ļ¼š[`examples/optimization/quickstart/`](../../../examples/optimization/quickstart/)怂 + +### 2.1 ē¤ŗä¾‹ä»»åŠ” + +ęœ¬ē¤ŗä¾‹ēš„ agent ę˜Æäø€äøŖ **å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ę±‚č§£å™Ø**ļ¼šęŽ„ę”¶č‡Ŗē„¶čÆ­čØ€ęčæ°ēš„ē®—ęœÆé¢˜ļ¼ˆå¦‚"å°ę˜Žę—©äøŠä¹°äŗ† 4 äøŖč‹¹ęžœļ¼Œäø‹åˆåˆä¹°äŗ† 7 äøŖč‹¹ęžœļ¼Œä»–äø€å…±ęœ‰å¤šå°‘äøŖč‹¹ęžœļ¼Ÿ"ļ¼‰ļ¼Œč¾“å‡ŗåø¦å•ä½ēš„ę•°å­—ē­”ę”ˆļ¼ˆå¦‚"ē­”ę”ˆļ¼š11 äøŖ")。 + +agent č”Œäøŗē”±äø¤äøŖ prompt ę–‡ä»¶å…±åŒé©±åŠØļ¼ŒäŗŒč€…å°±ę˜Æęœ¬ę¬”ä¼˜åŒ–ēš„ē›®ę ‡ļ¼š + +| ä¼˜åŒ–ē›®ę ‡ | 路径 | 在 agent äø­ēš„č§’č‰² | +|---|---|---| +| **system_prompt** | `agent/prompts/system.md` | č§’č‰²äøŽå›žē­”é£Žę ¼å®šä¹‰ļ¼ˆå¦‚"ä½ ę˜Æäø€äøŖę•°å­¦åŠ©ę•™ļ¼Œē”Øęø…ę™°ēš„äø­ę–‡å›žē­”") | +| **skill** | `agent/prompts/skill.md` | č§£é¢˜ę–¹ę³•č®ŗļ¼ˆå¦‚"å…ˆčÆ†åˆ«é¢˜åž‹ → 列式 → 讔算 → å†™å‡ŗåø¦å•ä½ēš„ē­”ę”ˆ") | + +čÆ„ęµ‹ä»Žäø¤äøŖē»“åŗ¦åŒę—¶ę‰“åˆ†ļ¼Œäø¤ę”éƒ½č¾¾ę ‡ę‰ē®— agent é€ščæ‡ļ¼š + +| čÆ„ęµ‹ęŒ‡ę ‡ | ē±»åž‹ | 阈值 | åˆ¤åˆ†ę–¹å¼ | +|---|---|---|---| +| `final_response_avg_score` | ę–‡ęœ¬åŒ¹é… | 1.0 | agent 输出必锻**包含**å‚č€ƒē­”ę”ˆę–‡ęœ¬ļ¼ˆå¦‚ "ē­”ę”ˆļ¼š11 äøŖ"ļ¼‰ļ¼Œå¤§å°å†™äøę•ę„Ÿ | +| `llm_rubric_response` | LLM 裁判 | 0.66 | 由独立 LLM ęŒ‰äø‰ę” rubric ę‰“åˆ†å–å‡å€¼ļ¼šā‘  ē­”ę”ˆę•°å€¼äøŽå‚č€ƒäø€č‡“ ā‘” ęŽØē†ę­„éŖ¤ęø…ę™° ā‘¢ ē­”ę”ˆåø¦ę­£ē”®å•ä½ | + +ę•°ę®é›†č§„ęØ”ļ¼šč®­ē»ƒé›† 
5 ę”ć€éŖŒčÆé›† 3 ę”ć€‚ + + +### 2.2 å‡†å¤‡ēŽÆå¢ƒ + +```bash +pip install "trpc-agent-py[optimize]" + +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +`[optimize]` extra 包含 `gepa`ļ¼ˆåę€ē®—ę³•å®žēŽ°ļ¼‰äøŽ `rich`ļ¼ˆē»ˆē«Æčæ›åŗ¦é¢ęæļ¼‰ć€‚ + + +### 2.3 ē›®å½•ē»“ęž„ + +```text +examples/optimization/quickstart/ +ā”œā”€ā”€ agent/ +│ ā”œā”€ā”€ agent.py # 定义 create_agent() å·„åŽ‚å‡½ę•° +│ ā”œā”€ā”€ config.py # ęØ”åž‹ / å‡­ę®ä»ŽēŽÆå¢ƒå˜é‡čÆ»å– +│ └── prompts/ +│ ā”œā”€ā”€ system.md # baseline system promptļ¼ˆå¾…ä¼˜åŒ–ļ¼‰ +│ └── skill.md # baseline skill ę–‡ę”£ļ¼ˆå¾…ä¼˜åŒ–ļ¼‰ +ā”œā”€ā”€ train.evalset.json # 5 ę”č®­ē»ƒē”Øä¾‹ļ¼ˆåę€ minibatch ę„ęŗļ¼‰ +ā”œā”€ā”€ val.evalset.json # 3 ę”éŖŒčÆē”Øä¾‹ļ¼ˆęÆč½®å…Øé‡čÆ„ä¼°ļ¼Œå†³å®šå€™é€‰ę˜Æå¦č¢«ęŽ„å—ļ¼‰ +ā”œā”€ā”€ optimizer.json # 算法 + metric é…ē½® +└── run_optimization.py # å…„å£č„šęœ¬ +``` + +> č®­ē»ƒé›†äøŽéŖŒčÆé›†åæ…é”»ę˜ÆäøåŒę–‡ä»¶ļ¼Œę”†ęž¶åÆåŠØęœŸä¼šę ”éŖŒč·Æå¾„äøé‡åˆć€‚ + +### 2.4 核心代码 + +`run_optimization.py` ē”±äø‰ę®µęž„ęˆļ¼ŒåÆ¹åŗ”ä¼˜åŒ–å™ØåÆ¹å¤–ēš„äø‰äøŖę øåæƒęŠ½č±”ć€‚ + +**ē¬¬äø€ę®µļ¼š`call_agent` —— äøšåŠ”ę”„ęŽ„å‡½ę•°**(详见 [§3.4](#34-call_agent)) + +ē­¾åå›ŗå®šäøŗ `async def(query: str) -> str`ć€‚ę”†ęž¶é€ščæ‡å®ƒé©±åŠØ agent å®Œęˆå•ę¬”ęŽØē†ļ¼›ä»»ę„å½¢ę€ēš„ agent(`LlmAgent`态HTTP ęœåŠ”ć€å­čæ›ēØ‹ CLI ē­‰ļ¼‰éƒ½é€ščæ‡čæ™å±‚ę”„ęŽ„ęŽ„å…„ć€‚ + +```python +async def call_agent(query: str) -> str: + # ęÆę¬”é‡čÆ» prompt ꖇ件 → GEPA å†™å…„ę–°å€™é€‰ē«‹å³ē”Ÿę•ˆ + root_agent = create_agent() + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=root_agent, + session_service=session_service) + # ... 
发送 user_content态ꔶ集 is_final_response äŗ‹ä»¶ + return final_text.strip() +``` + +**第二段:`TargetPrompt` —— ä¼˜åŒ–ē›®ę ‡å£°ę˜Ž**(详见 [§3.3](#33-targetprompt)) + +ę³Øå†Œå“Ŗäŗ› prompt å­—ę®µä¼šč¢«ä¼˜åŒ–å™ØčÆ»å†™ć€‚ęÆäøŖå­—ę®µåÆ¹åŗ”äø€äøŖęœ¬åœ°ę–‡ä»¶ļ¼ˆ`add_path`ļ¼‰ęˆ–äø€åÆ¹å¼‚ę­„čÆ»å†™å›žč°ƒļ¼ˆ`add_callback`ļ¼Œē”ØäŗŽčæœē«Æ KV ē­‰ä»»ę„åŽē«Æļ¼‰ć€‚ + +```python +target = ( + TargetPrompt() + .add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + .add_path("skill", str(SKILL_PATH)) +) +``` + +**ē¬¬äø‰ę®µļ¼š`AgentOptimizer.optimize` —— ä¼˜åŒ–å™Øč°ƒē”Ø**ļ¼ˆå®Œę•“å‚ę•°č§ [§7.1](#71-agentoptimizeroptimize-å‚ę•°č”Ø)) + +```python +await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(RUNS_DIR / timestamp), + update_source=False, + verbose=1, +) +``` + +| å‚ę•° | čÆ“ę˜Ž | +|---|---| +| `config_path` | `optimizer.json`ļ¼Œå®šä¹‰ metric / 算法 / åœęœŗę”ä»¶ | +| `output_dir` | äŗ§ē‰©ē›®å½•ļ¼›äøå­˜åœØä¼šč‡ŖåŠØåˆ›å»ŗļ¼Œå»ŗč®®ē”Øę—¶é—“ęˆ³å­ē›®å½• | +| `update_source` | `False` åŖäŗ§å‡ŗ `best_prompts/`ļ¼›`True` ä¼˜åŒ–ęˆåŠŸåŽå›žå†™ęŗę–‡ä»¶ļ¼ˆCI åœŗę™Æļ¼ŒčÆ¦č§ [§4.6](#46)) | +| `verbose` | `0` 静默 / `1` Rich čæ›åŗ¦é¢ęæ / `2` 附 gepa čÆŠę–­ę—„åæ— | + +### 2.5 é…ē½®ę–‡ä»¶ `optimizer.json` + +é…ē½®åˆ†äø¤ę®µļ¼š`evaluate`ļ¼ˆčÆ„ęµ‹ļ¼ŒäøŽčÆ„ęµ‹ęØ”å—åŒęŗļ¼‰+ `optimize`ļ¼ˆä¼˜åŒ–å™Øäø“å±žļ¼‰ć€‚ + +```json +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": {"text": {"match": "contains", "case_insensitive": true}} + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "...", "base_url": "...", "api_key": "..."}, + "rubrics": [ + {"id": "numeric_correct", "content": {"text": "ē­”ę”ˆę•°å€¼äøŽå‚č€ƒäø€č‡“"}, "type": "FINAL_RESPONSE_QUALITY"}, + {"id": 
"reasoning_clear", "content": {"text": "ęŽØē†ę­„éŖ¤ęø…ę™°"}, "type": "FINAL_RESPONSE_QUALITY"}, + {"id": "units_present", "content": {"text": "ē­”ę”ˆåø¦ę­£ē”®å•ä½"}, "type": "FINAL_RESPONSE_QUALITY"} + ] + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": {"required_metrics": "all"}, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": {"model_name": "...", "base_url": "...", "api_key": "..."}, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 60, + "max_iterations_without_improvement": 8 + } + } +} +``` + +ęœ¬ē¤ŗä¾‹ē”Øåˆ°ēš„å…³é”®ę¦‚åæµļ¼š + +| 概念 | åœØé…ē½®äø­ēš„ä½ē½® | äø€å„čÆčÆ“ę˜Ž | 详见 | +|---|---|---|---| +| **metric** | `evaluate.metrics[]` | čÆ„ęµ‹ęŒ‡ę ‡åˆ—č”Øļ¼›å¤šę”åÆå åŠ ļ¼ŒęÆę”ē‹¬ē«‹ę‰“åˆ† | [§4.5](#45) | +| **LLM judge** | `criterion.llm_judge` | LLM č£åˆ¤ļ¼ŒęŒ‰ rubric ę‰“åˆ†ļ¼›ęœ¬ä¾‹äøŗ `llm_rubric_response` ęœåŠ” | [§4.5](#45) | +| **stop.required_metrics** | `optimize.stop.required_metrics` | ę”†ęž¶ēŗ§åœęœŗļ¼šå“Ŗäŗ› metric åæ…é”»åŒę—¶č¾¾é˜ˆå€¼ę‰åœ | [§7.3.5](#735-optimizestop-ꮵ) | +| **reflection_lm** | `optimize.algorithm.reflection_lm` | åę€åž‹ LLMļ¼ŒęÆč½®ę£€č§†å¤±č“„ē”Øä¾‹å¹¶ē”Ÿęˆę–°å€™é€‰ prompt | [§3.8](#38-reflection-lm) / [§6.5](#65-reflection-lm-é€‰åž‹å»ŗč®®č”Ø) | +| **candidate_selection_strategy** | `optimize.algorithm` | ęÆč½®ęŒ‘å“ŖäøŖå€™é€‰ä½œäøŗåę€ parent | [§7.3.3](#733-optimizealgorithm-ꮵ) | +| **module_selector** | `optimize.algorithm` | å¤šå­—ę®µä¼˜åŒ–ę—¶ęÆč½®é€‰å“ŖäøŖå­—ę®µę”¹å†™ | [§4.3](#43) | +| **reflection_minibatch_size** | `optimize.algorithm` | ęÆč½®åę€ä»Ž train ęŠ½å‡ ę” case | [§5](#5-gepa-ę˜Æę€Žä¹ˆå·„ä½œēš„) | +| **stopper** | `optimize.algorithm.max_*` / `timeout_seconds` / `score_threshold` | ē®—ę³•å±‚åœęœŗę”ä»¶ļ¼Œč‡³å°‘éœ€č¦č®¾ē½®äø€äøŖ | [§4.7](#47) / [§7.3.3](#733-optimizealgorithm-ꮵ) | + 
+å®Œę•“å­—ę®µå‚č€ƒč§ [§7.3](#73-optimizerjson-é…ē½®é”¹č”Ø)怂 + + +### 2.6 运蔌 + +```bash +python examples/optimization/quickstart/run_optimization.py +``` + +ē»ˆē«Æä¾åŗč¾“å‡ŗļ¼šbaseline čÆ„ä¼°åˆ†ę•° → ęÆč½®åę€ēš„ęŽ„å—/ę‹’ē»č®°å½• → ę”¶å°¾ę‘˜č¦ć€‚å°č§„ęØ”é…ē½®äø‹ 1~3 åˆ†é’Ÿå®Œęˆć€‚ + +![Quickstart ē»ˆē«Æč¾“å‡ŗē¤ŗä¾‹](../assets/imgs/optimization_quickstart.png) + +### 2.7 产物 + +```text +runs/<ę—¶é—“ęˆ³>/ +ā”œā”€ā”€ result.json # å®Œę•“čæč”Œč®°å½•ļ¼ˆOptimizeResult åŗåˆ—åŒ–ļ¼‰ +ā”œā”€ā”€ summary.txt # äŗŗē±»åÆčÆ»ę€»č§ˆļ¼ˆé¦–å…ˆēœ‹čæ™äøŖļ¼‰ +ā”œā”€ā”€ run.log # å•č”ŒēŠ¶ę€ +ā”œā”€ā”€ config.snapshot.json # č¾“å…„é…ē½®ēš„åæ«ē…§å‰Æęœ¬ +ā”œā”€ā”€ rounds/round_NNN.json # ęÆč½® RoundRecord +ā”œā”€ā”€ baseline_prompts/<字段>.md # ä¼˜åŒ–å‰åæ«ē…§ +└── best_prompts/<字段>.md # ä¼˜åŒ–åŽęœ€ä½³å€™é€‰ļ¼ˆä»… SUCCEEDED) +``` + +`summary.txt` å…³é”®č”Œļ¼š + +```text +Optimization complete | status=SUCCEEDED | algorithm=gepa_reflective +pass_rate : 0.5000 -> 0.8500 (+0.3500, improved) +rounds : 3 accepted / 7 total +duration : 124.31s +stop_reason : required_metrics_passing +update_source : false +``` + +> **ä»€ä¹ˆę˜Æ pass_rate?** +> +> pass_rate č””é‡ēš„ę˜Æļ¼š**ä½ ēš„ agent åœØéŖŒčÆé›†äøŠ"åšåÆ¹äŗ†"å¤šå°‘ęÆ”ä¾‹ēš„é¢˜**怂 +> +> --- +> +> **ē¬¬äø€ę­„ļ¼šęÆäøŖ metric ē‹¬ē«‹åˆ¤å®šč¾¾ę ‡/ęœŖč¾¾ę ‡** +> +> ęÆäøŖ metric ęœ‰č‡Ŗå·±ēš„é˜ˆå€¼ļ¼ˆthresholdļ¼‰ļ¼Œåˆ†ę•° ≄ é˜ˆå€¼å°±č¾¾ę ‡ļ¼Œå¦åˆ™ęœŖč¾¾ę ‡ć€‚ +> +> **ē¬¬äŗŒę­„ļ¼šäø€ę” case ēš„é€ščæ‡č§„åˆ™ā€”ā€”ę‰€ęœ‰ metric éƒ½č¾¾ę ‡ę‰ē®—é€ščæ‡** +> +> å°±åƒč€ƒčÆ•åŒę—¶č€ƒčÆ­ę–‡å’Œę•°å­¦ļ¼Œäø¤ē§‘éƒ½åŠę ¼ę‰ē®—"é€ščæ‡"ļ¼Œä»»ä½•äø€ē§‘äøåŠę ¼å°±ę˜Æ"失蓄"怂 +> +> **ē¬¬äø‰ę­„ļ¼špass_rate = é€ščæ‡ēš„ case ꕰ Ć· ꀻ case ꕰ** +> +> --- +> +> **å®Œę•“ē¤ŗä¾‹**ļ¼šå‡č®¾éŖŒčÆé›†ęœ‰ 4 ę” caseļ¼Œé…äŗ† 3 äøŖ metric: +> +> | | metric_Aļ¼ˆé˜ˆå€¼ 0.8) | metric_Bļ¼ˆé˜ˆå€¼ 0.6) | metric_Cļ¼ˆé˜ˆå€¼ 1.0) | čæ™ę” case é€ščæ‡äŗ†å—ļ¼Ÿ | +> | --- | --- | --- | --- | --- | +> | case_1 | 得分 0.9 āœ… | 得分 0.7 āœ… | 得分 1.0 āœ… | **é€ščæ‡**(3 äøŖéƒ½č¾¾ę ‡ļ¼‰ 
| +> | case_2 | 得分 0.85 āœ… | 得分 0.4 āŒ | 得分 1.0 āœ… | **失蓄**(metric_B 沔达标) | +> | case_3 | 得分 0.6 āŒ | 得分 0.8 āœ… | 得分 0.0 āŒ | **失蓄**(metric_A态C 沔达标) | +> | case_4 | 得分 0.95 āœ… | 得分 0.9 āœ… | 得分 1.0 āœ… | **é€ščæ‡**(3 äøŖéƒ½č¾¾ę ‡ļ¼‰ | +> +> é€ščæ‡ 2 ę”ļ¼Œę€»å…± 4 ę”ļ¼š +> +> ``` +> pass_rate = 2 / 4 = 0.5 +> ``` +> +> --- +> +> **å›žåˆ°äøŠé¢ēš„ summary.txt**: +> +> ``` +> pass_rate : 0.5000 -> 0.8500 (+0.3500, improved) +> ``` +> +> ę„ę€ę˜Æļ¼šä¼˜åŒ–å‰ agent åŖčƒ½åšåÆ¹äø€åŠēš„ caseļ¼Œä¼˜åŒ–åŽčƒ½åšåÆ¹ 85%ć€‚ęå‡äŗ† 35 äøŖē™¾åˆ†ē‚¹ć€‚ +> +> **三个相关字段**: +> +> | 字段 | 含义 | +> | --- | --- | +> | `baseline_pass_rate` | ä¼˜åŒ–å‰ēš„é€ščæ‡ēŽ‡ļ¼ˆē”Øåˆå§‹ prompt č·‘å‡ŗę„ēš„åˆ†ę•°ļ¼‰ | +> | `best_pass_rate` | ä¼˜åŒ–čæ‡ēØ‹äø­ę‰¾åˆ°ēš„ęœ€é«˜é€ščæ‡ēŽ‡ | +> | `pass_rate_improvement` | `best - baseline`ļ¼Œęœ¬ę¬”ä¼˜åŒ–ēš„ęå‡å¹…åŗ¦ | + +å„å­—ę®µå®Œę•“å«ä¹‰č§ [§8 äŗ§ē‰©äøŽē›®å½•ēŗ¦å®š](#8-äŗ§ē‰©äøŽē›®å½•ēŗ¦å®š)怂 + +### 2.8 下一歄 + +| ä½ ēš„äø‹äø€äøŖé—®é¢˜ | č·³č½¬ē« čŠ‚ | +|---|---| +| äøŠé¢čæ™äŗ› API ę¦‚åæµåˆ°åŗ•ę˜Æä»€ä¹ˆ | [§3 ę øåæƒę¦‚åæµ](#3-ę øåæƒę¦‚åæµ) | +| ęˆ‘ēš„ agent äøę˜Æčæ™ē§ęœ¬åœ° LlmAgentļ¼Œę€Žä¹ˆęŽ„å…„ļ¼Ÿ | [§4 ä½ ēš„åœŗę™Æ → ę€Žä¹ˆęŽ„å…„](#4-ä½ ēš„åœŗę™Æ--ę€Žä¹ˆęŽ„å…„) | +| åę€ā€”čÆ„ä¼°ā€”äæē•™å¾ŖēŽÆęÆäø€ę­„å…·ä½“åœØåšä»€ä¹ˆ | [§5 GEPA ę˜Æę€Žä¹ˆå·„ä½œēš„](#5-gepa-ę˜Æę€Žä¹ˆå·„ä½œēš„) | +| ęƒ³ä¼°ē®— LLM č°ƒē”Øęˆęœ¬ / č°ƒę•“å¹¶å‘å‚ę•° | [§6 ęˆęœ¬äøŽå¹¶å‘](#6-ęˆęœ¬äøŽå¹¶å‘) | +| ęƒ³ē›“ęŽ„ęŸ„å‚ę•° / é…ē½®é”¹ | [§7 å®Œę•“ API å‚č€ƒ](#7-å®Œę•“-api-å‚č€ƒ) | + + + +## 3 ę øåæƒę¦‚åæµ + +> čæ™čŠ‚ē”Ø 8 个概念建立 optimization ęØ”å—ēš„"åæƒę™ŗęØ”åž‹"ć€‚ęÆäøŖę¦‚åæµéƒ½ä»Ž"å®ƒåÆ¹åŗ”ä½ å·„ä½œé‡Œēš„ä»€ä¹ˆ"åˆ‡å…„ļ¼Œč€Œäøę˜Æä»Žē±»åž‹ē­¾ååˆ‡å…„ć€‚ä»‹ē»é”ŗåŗäøŽ [§2.4 核心代码](#24-核心代码)äø­äø‰ę®µä»£ē ēš„å‡ŗēŽ°é”ŗåŗäø€č‡“ć€‚ + +### 3.1 ęØ”å—ę•“ä½“ę•°ę®ęµ + +optimization ęØ”å—ēš„å·„ä½œå›žč·Æļ¼šē”Øęˆ·č¾“å…„ 4 ē±»čµ„äŗ§ļ¼ŒęØ”å—åœØåę€ā€”čÆ„ä¼°ā€”äæē•™å¾ŖēŽÆé‡Œäŗ§å‡ŗ 2 ē±»ē»“ęžœć€‚ + +```text + +---> 评估候选 + | | + call_agent ---+ | v + | | 
åę€å¤±č“„ + optimizer.json ---+ | | + | | v ---> OptimizeResult + +------>| å†™ē›˜ę–°å€™é€‰ (å†…å­˜čæ”å›ž) + TargetPrompt ---+ | | + + | | v runs/<ę—¶é—“ęˆ³>/ + EvalSet x 2 ---+ | ęŽ„å—ę–° best? (宔讔目录) + | 是:äæē•™ / 否:丢弃 + | | + +---------+ +``` + +å››ē±»č¾“å…„ēš„č§’č‰²ļ¼š + +| 输兄 | 形态 | åœØå¾ŖēŽÆäø­ēš„ä½œē”Ø | +| --- | --- | --- | +| `call_agent` | `async (str) -> str` | 把 query é€ē»™äøšåŠ” agentļ¼›ä¼˜åŒ–å™Øä»„ę­¤é‡‡ę ·č”Œäøŗ | +| `optimizer.json` | JSON é…ē½® | å®šä¹‰čÆ„ęµ‹ęŒ‡ę ‡ļ¼ˆ`evaluate.metrics`ļ¼‰äøŽē®—ę³•å‚ę•°ļ¼ˆ`optimize.algorithm`) | +| `TargetPrompt` | å¤šå­—ę®µ prompt ę³Øå†Œč”Ø | å£°ę˜Žå“Ŗäŗ› prompt ꖇ件 / čæœē«Æé…ē½®ä½ę˜Æä¼˜åŒ–ē›®ę ‡ | +| `EvalSet Ɨ 2` | 两份 evalset | č®­ē»ƒé›†ä¾›åę€ LM ēœ‹å¤±č“„ę”ˆä¾‹ļ¼ŒéŖŒčÆé›†ä¾›ę‰“åˆ† / ę—©åœåˆ¤å®š | + +äø¤ē±»äŗ§å‡ŗēš„åŽ»å‘ļ¼š + +| 产出 | 形态 | å…øåž‹ē”Øé€” | +| --- | --- | --- | +| `OptimizeResult` | `optimize()` čæ”å›žēš„å†…å­˜åÆ¹č±” | ēØ‹åŗčÆ»å–ļ¼ˆbaseline / best / 各 round ę˜Žē»†ļ¼‰ | +| `runs/<ę—¶é—“ęˆ³>/` | 宔讔目录 | äŗŗå·„ review态CI č§£ęžć€å¤č·‘ļ¼ˆčÆ¦č§ [§8](#8-äŗ§ē‰©äøŽē›®å½•ēŗ¦å®š)) | + +### 3.2 call_agent + +**äø€å„čÆ**ļ¼šä½ ēš„äøšåŠ” agent ēš„"é€šē”Øę’å¤“"怂 + +**äøŗä»€ä¹ˆéœ€č¦**ļ¼šä½ ēš„ agent åÆčƒ½ę˜Æęœ¬åœ° `LlmAgent`ć€åÆčƒ½ę˜ÆéƒØē½²å„½ēš„ HTTP ęœåŠ”ć€åÆčƒ½ę˜Æ `claude` / `codex` čæ™ē§é»‘ē›’ CLIć€‚ęØ”å—äøåÆčƒ½äøŗęÆē§å½¢ę€å†™é€‚é…å™Øļ¼›ä½ åŖéœ€č¦ęŠŠ"给一段 query → ę‹æåˆ° agent ęœ€ē»ˆå›žå¤"čæ™äøŖåŠØä½œåŒ…ęˆäø€äøŖ async å‡½ę•°ļ¼ŒęØ”å—é€ščæ‡å®ƒé©±åŠØ agent 跑评测。 + +**ę€Žä¹ˆē”Ø**: + +```python +async def call_agent(query: str) -> str: + # ä½ ēš„å®žēŽ°ļ¼šč°ƒęœ¬åœ° agent / HTTP ęœåŠ” / 子进程 CLI 都蔌 + # å…³é”®ē‚¹ļ¼šęÆę¬”éƒ½é‡čÆ» prompt ę–‡ä»¶ļ¼ˆč®© GEPA å†™å…„ēš„ę–°å€™é€‰ē«‹å³ē”Ÿę•ˆļ¼‰ + root_agent = create_agent() + runner = Runner(...) 
+ return await run_and_collect_final_response(runner, query) +``` + +ē­¾åå›ŗå®šäøŗ `async (str) -> str`ļ¼Œäøčƒ½å¤šå‚ę•°ä¹Ÿäøčƒ½åŒę­„ć€‚ + +**ę”†ęž¶åœØäø‰äøŖę—¶ęœŗč°ƒē”Øå®ƒ**: + +| ę—¶ęœŗ | é¢‘ēŽ‡ | +|---|---| +| baseline 评估 | ęÆę” val case Ɨ `num_runs` | +| ęÆč½®åę€ēš„ minibatch 评估 | ęÆę”ęŠ½ę · case 1 ꬔ | +| ęÆč½®å€™é€‰ēš„éŖŒčÆé›†čÆ„ä¼° | ęÆę” val case Ɨ `num_runs` | + +### 3.3 TargetPrompt + +**äø€å„čÆ**ļ¼šå‘ŠčÆ‰ęØ”å—"哪些 prompt ę–‡ä»¶ę˜Æč¦č¢«ä¼˜åŒ–ēš„"ļ¼Œē›øå½“äŗŽ**ä¼˜åŒ–ē›®ę ‡ēš„ę³Øå†Œč”Ø**怂 + +**äøŗä»€ä¹ˆéœ€č¦**:agent é”¹ē›®é‡Œ prompt é€šåøøåˆ†ę•£åœØå¤šäøŖę–‡ä»¶ē”šč‡³å¤šäøŖåŽē«Æļ¼ˆsystem.md / skill.md / čæ˜ęœ‰ę”¾åœØäøƒå½©ēŸ³ēš„ē‰ˆęœ¬ļ¼‰ļ¼›ęØ”å—éœ€č¦ēŸ„é“ļ¼š**åę€å‡ŗę–°å€™é€‰ę—¶ļ¼Œåŗ”čÆ„ęŠŠå®ƒå†™åˆ°å“Ŗé‡Œć€čÆ» baseline ę—¶åŗ”čÆ„ä»Žå“Ŗé‡ŒčÆ»**怂`TargetPrompt` å°±ę˜Æčæ™äøŖ"åœ°å€ē°æ"怂 + +**ę€Žä¹ˆē”Ø**: + +```python +from trpc_agent_sdk.evaluation import TargetPrompt + +target = ( + TargetPrompt() + .add_path("system_prompt", "agent/prompts/system.md") # ę–‡ä»¶åž‹ + .add_path("skill", "agent/prompts/skill.md") # ę–‡ä»¶åž‹ + .add_callback("rule", # å›žč°ƒåž‹ļ¼ˆčæœē«Æ KV) + read=load_rule_from_kv, + write=save_rule_to_kv) +) +``` + +ęÆäøŖå­—ę®µ `name`ļ¼ˆå¦‚ `"system_prompt"`ļ¼‰åœØä½ ä¼˜åŒ–ē»“ęŸåŽä¼šå˜ęˆļ¼š + +- `result.best_prompts["system_prompt"]` —— ēØ‹åŗčÆ»ęœ€ä¼˜ prompt +- `runs/<ę—¶é—“ęˆ³>/best_prompts/system_prompt.md` —— äŗŗčÆ»ęœ€ä¼˜ prompt +- `RoundRecord.optimized_field_names` é‡Œēš„å…ƒē“  —— ēœ‹ęÆč½®ę”¹äŗ†å“ŖäøŖå­—ę®µ + +**äø¤ē§ęŗ**: + +| 源 | 适用 | ę”†ęž¶åšä»€ä¹ˆ | +|---|---|---| +| `add_path(name, path)` | prompt åœØęœ¬åœ°ę–‡ä»¶ | å†™ē›˜čµ° tmp + `os.replace` åŽŸå­å†™ļ¼Œå¤šå­—ę®µå¤±č“„å›žę»šęŗę–‡ä»¶ | +| `add_callback(name, *, read, write)` | prompt åœØčæœē«Æé…ē½®äø­åæƒ / ę•°ę®åŗ“ / git ē­‰ä»»ę„åŽē«Æ | č°ƒä½ ēš„ `read` / `write` async å‡½ę•°ļ¼ŒåŽŸå­ę€§ē”±ä½ äæčÆ | + +å®Œę•“ API 见 [§7.2](#72-targetprompt-api-蔨)怂 + +### 3.4 AgentOptimizer + +**äø€å„čÆ**ļ¼šęØ”å—ēš„"å¼€ęœŗęŒ‰é’®"怂 + +**äøŗä»€ä¹ˆéœ€č¦**ļ¼šä½ 
äøä¼šęƒ³č‡Ŗå·±ę‰‹å†™"čÆ»é…ē½® → ę ”éŖŒč¾“å…„ → č·‘åę€å¾ŖēŽÆ → č½ē›˜ → 拼 result"这一敓儗流程;`AgentOptimizer` ęŠŠčæ™å„—ęµēØ‹å°č£…ęˆäø€äøŖč°ƒē”Øā€”ā€”ä½ ē»™å®ƒ**输兄**ļ¼Œå®ƒčæ”å›ž**ē»“ęžœ**怂 + +**ę€Žä¹ˆē”Ø**: + +```python +from trpc_agent_sdk.evaluation import AgentOptimizer + +result = await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs/2026-05-19T17-00-00", +) +print(result.best_pass_rate) +``` + +ę•“äøŖęØ”å—åŖęœ‰čæ™äø€äøŖå…¬å¼€å…„å£ļ¼Œ**ę²”ęœ‰åˆ«ēš„ę–¹å¼åÆåŠØä¼˜åŒ–**怂 + +**å®ƒåšäŗ†ä»€ä¹ˆ**: + +1. 加载并栔验 `optimizer.json`(schema äøåÆ¹å°±åœØč·‘ä¹‹å‰ęŠ›é”™ļ¼‰ +2. 栔验 `call_agent` 是 async 函数 / `target_prompt` č‡³å°‘ę³Øå†Œäø€äøŖå­—ę®µ / č®­ē»ƒé›† ≠ éŖŒčÆé›† +3. č·‘åę€ā€”čÆ„ä¼°ā€”äæē•™å¾ŖēŽÆ +4. ęŠŠäŗ§ē‰©č½ē›˜åˆ° `output_dir/` +5. čæ”å›žäø€äøŖ `OptimizeResult` 对豔 + +`optimize` 共 11 äøŖ keyword-only å‚ę•°ļ¼Œåøøē”Ø 6 个见 [§2.4](#24-核心代码)ļ¼Œå…ØéƒØå‚ę•°čÆ¦č§ [§7.1](#71-agentoptimizeroptimize-å‚ę•°č”Ø)怂 + +**`update_source` 决策蔨**ļ¼ˆę‰€ęœ‰ §4.x åœŗę™Æå…±äŗ«ēš„å…³é”®å‚ę•°ļ¼‰ļ¼šå†³å®šä¼˜åŒ–ęˆåŠŸåŽę˜Æå¦ęŠŠęœ€ä¼˜å€™é€‰**回写**到 `TargetPrompt` ę³Øå†Œēš„ęŗ prompt 文件—— + +| `update_source` | ä¼˜åŒ–ęˆåŠŸåŽåšä»€ä¹ˆ | ē”Ÿę•ˆč·Æå¾„ | é€‚ē”Øåœŗę™Æ | +|---|---|---|---| +| `False`ļ¼ˆé»˜č®¤ļ¼‰ | åŖęŠŠęœ€ä¼˜å€™é€‰å†™åˆ° `output_dir/best_prompts/` | ä½ **äŗŗå·„** review → å¤åˆ¶åˆ°ēŗæäøŠ prompt ꖇ件 → äø‹äø€ę¬”č°ƒē”Øē”Ÿę•ˆ | ē°åŗ¦äøŠēŗæć€éœ€č¦äŗŗå·„å®”ę øć€äøåøŒęœ›ä¼˜åŒ–å™Øē›“ęŽ„åŠØēŗæäøŠę–‡ä»¶ | +| `True` | ē”Øęœ€ä¼˜å€™é€‰**ē›“ęŽ„č¦†ē›–**源 prompt ꖇ件 | äøšåŠ”äø‹äø€ę¬”č°ƒē”Ø**ē«‹å³**č‡ŖåŠØē”ØäøŠę–° prompt | č‡ŖåŠØåŒ–é—­ēŽÆļ¼ˆå¦‚å¤œé—“ä¼˜åŒ–ä»»åŠ”ļ¼ŒčÆ¦č§ [§4.6 CI é—­ēŽÆ](#46)) | + +ę— č®ŗé€‰å“Ŗē§ļ¼ŒäøšåŠ”ä¾§**é›¶é‡åÆć€é›¶ä»£ē ę”¹åŠØ**ā€”ā€”ę„ŸēŸ„ prompt å˜åŒ–ēš„ę–¹å¼å§‹ē»ˆę˜Æ"äø‹äø€ę¬”č°ƒē”Øé‡čÆ»ę–‡ä»¶"怂 + +> `update_source=True` ēš„å®‰å…ØäæčÆļ¼šč¦†ē›–é‡‡ē”Ø tmp + `os.replace` 
åŽŸå­å†™ļ¼›å¦‚ęžœä¼˜åŒ–äø­é€”å¼‚åøøęˆ– SIGINT äø­ę–­ļ¼Œęŗ prompt ꖇ件**äøä¼šč¢«åŠå†™**ļ¼ŒäæęŒåŽŸå†…å®¹ļ¼ˆčÆ¦č§ [§8.3 åŽŸå­č½ē›˜](#83-åŽŸå­č½ē›˜äæčÆ))。 + +### 3.5 optimizer.json + +**äø€å„čÆ**ļ¼šäø€ä»½é…ē½®ę–‡ä»¶ļ¼Œå‘ŠčÆ‰ęØ”å—"ä»€ä¹ˆē®—å„½"和"ę€Žä¹ˆęœē“¢"怂 + +**äøŗä»€ä¹ˆéœ€č¦**:metric é˜ˆå€¼ć€minibatch å¤§å°ć€reflection LM é…ē½®ć€åœęœŗę”ä»¶ā€¦ā€¦čæ™äŗ›å‚ę•°å¦‚ęžœę•£åœØä»£ē é‡Œļ¼ŒęÆę¬”č·‘å®žéŖŒéƒ½č¦ę”¹ä»£ē ć€‚é›†äø­åˆ°äø€äøŖ JSON ę–‡ä»¶åŽļ¼Œč°ƒå‚ = 改 JSONļ¼ŒåÆé‡ēŽ°ę€§ä¹Ÿę›“å„½ļ¼ˆäŗ§ē‰©é‡Œä¼šäæå­˜äø€ä»½ `config.snapshot.json`)。 + +**é•æä»€ä¹ˆę ·**:[§2.5](#25-é…ē½®ę–‡ä»¶-optimizerjson) å·²ē»ēœ‹čæ‡å®Œę•“ē¤ŗä¾‹ć€‚ē»“ęž„äøŠåˆ†äø¤ę®µļ¼š + +```text +{ + "evaluate": { ... }, # äøŽ AgentEvaluator 同 schema:metric åˆ—č”Ø + num_runs + "optimize": { + "eval_case_parallelism": 2, + "stop": { # ę”†ęž¶ēŗ§åœęœŗļ¼šå“Ŗäŗ› metric åæ…é”»č¾¾é˜ˆå€¼ + "required_metrics": "all" + }, + "algorithm": { # ē®—ę³•äø“å±žļ¼šreflection_lm / minibatch / 6 ē§ stopper + "name": "gepa_reflective", + ... 
+ } + } +} +``` + +**äø¤ę®µēš„åˆ†å·„**: + +- `evaluate` 段:**å®Œå…Øå¤ē”Ø**čÆ„ęµ‹ęØ”å—ēš„ schemać€‚ä½ ē»™čÆ„ęµ‹é”¹ē›®å†™čæ‡ēš„ metric é…ē½®ļ¼ŒåÆä»„ē›“ęŽ„ę‹·čæ‡ę„ +- `optimize` 段:**ä¼˜åŒ–å™Øäø“å±ž**。其中 `algorithm.name` ę˜Æē®—ę³•é€‰ę‹©å™Øļ¼Œē›®å‰å”Æäø€åÆé€‰å€¼ę˜Æ `"gepa_reflective"`ļ¼ŒęœŖę„ę‰©å±•ę–°ē®—ę³•ę—¶é€ščæ‡ [§9.2 ę³Øå†Œę–°ē®—ę³•](#92) 增加 + +å®Œę•“å­—ę®µč”Øč§ [§7.3](#73-optimizerjson-é…ē½®é”¹č”Ø)怂 + +### 3.6 EvalSet / EvalCase + +**äø€å„čÆ**ļ¼šč®­ē»ƒé›† + éŖŒčÆé›†ļ¼Œę ¼å¼äøŽčÆ„ęµ‹ęØ”å—å®Œå…Øē›øåŒć€‚ + +**äøŗä»€ä¹ˆéœ€č¦åˆ†äø¤äøŖę–‡ä»¶**: + +- **č®­ē»ƒé›†**ļ¼šęØ”å—ęÆč½®ä»Žäø­**éšęœŗęŠ½**å‡ ę” case(`reflection_minibatch_size`,默认让 gepa å†³å®šļ¼‰ē»™ reflection LM ēœ‹å¤±č“„ę”ˆä¾‹ → ē”Øę„"找改进方向" +- **éŖŒčÆé›†**ļ¼šęÆäøŖę–°å€™é€‰ē”ŸęˆåŽļ¼ŒåœØå®ƒäøŠé¢**å…Øé‡č·‘**ē®—åˆ† → ē”Øę„"éŖŒčÆå€™é€‰ę˜Æå¦ēœŸēš„ę›“å„½" + +**äøŗä»€ä¹ˆåæ…é”»ę˜ÆäøåŒę–‡ä»¶**ļ¼šč®­ē»ƒé›†å†³å®šäŗ† reflection LM ēœ‹åˆ°ä»€ä¹ˆļ¼ŒéŖŒčÆé›†å†³å®šäŗ†å€™é€‰ę˜Æå¦č¢«ęŽ„å—ć€‚å¦‚ęžœäø¤č€…é‡åˆļ¼Œå°±ęˆäŗ†"ē”Øč€ƒé¢˜åˆ·é¢˜ć€å†ē”Øč€ƒé¢˜åˆ¤åˆ†"ā€”ā€”ę‹æåˆ°ēš„ best_pass_rate äøåÆäæ”ć€‚ę”†ęž¶åÆåŠØęœŸä¼šęÆ”åÆ¹č·Æå¾„ļ¼ˆ`os.path.normpath(os.path.abspath(...))`ļ¼‰é˜²å¾”čæ™äø€ē‚¹ļ¼Œé‡åˆē›“ęŽ„ęŠ› `ValueError`怂 + +ę ¼å¼äøŽē¼–å†™ęŒ‡å¼•č§ [čÆ„ęµ‹é›†ē¼–å†™ęŒ‡å—](evaluation.md#评测集evalsetē¼–å†™ęŒ‡å—)怂 + +### 3.7 OptimizeResult + +**äø€å„čÆ**ļ¼šäø€ę¬”ä¼˜åŒ–č·‘å®ŒåŽēš„"å…ØéƒØäŗ§å‡ŗ"ļ¼Œę—¢ę˜Æ `optimize()` ēš„čæ”å›žå€¼ļ¼Œä¹Ÿę˜Æ `runs/<ę—¶é—“ęˆ³>/result.json` ēš„å†…å®¹ć€‚ + +**äøŗä»€ä¹ˆéœ€č¦å®ƒ**ļ¼šä½ č·‘å®Œä¼˜åŒ–ęœ€å…³åæƒäø‰ä»¶äŗ‹ā€”ā€”ęˆåŠŸäŗ†å— / ęå‡å¤šå°‘ / ęœ€ä¼˜ prompt ę˜Æä»€ä¹ˆć€‚`OptimizeResult` ęŠŠå®ƒä»¬ę‰“åŒ…ļ¼š + +```python +result = await AgentOptimizer.optimize(...) + +# 1. ęˆåŠŸäŗ†å— +if result.status == "SUCCEEDED": + ... + +# 2. ęå‡å¤šå°‘ +print(f"{result.baseline_pass_rate:.2%} → {result.best_pass_rate:.2%}, " + f"+{result.pass_rate_improvement:.2%}") + +# 3. 
ęœ€ä¼˜ prompt ę˜Æä»€ä¹ˆ +new_system_prompt = result.best_prompts["system_prompt"] +new_skill = result.best_prompts["skill"] +``` + +å®ƒčæ˜ęŗåø¦čæ‡ēØ‹ę•°ę®ļ¼ˆęÆč½®å‘ē”Ÿäŗ†ä»€ä¹ˆć€reflection LM č°ƒē”Øę¬”ę•°ć€ę€»č€—ę—¶ē­‰ļ¼‰ä¾›äŗ‹åŽåˆ†ęžć€‚ + +**ęœ€åøøēœ‹ēš„ 6 个字段**: + +| 字段 | ē±»åž‹ | 含义 | +|---|---|---| +| `status` | `"SUCCEEDED"` / `"FAILED"` / `"CANCELED"` | ē»ˆę€ | +| `baseline_pass_rate` / `best_pass_rate` | `float` | 优化前 / 后 pass rate | +| `pass_rate_improvement` | `float` | äŗŒč€…å·®å€¼ | +| `best_prompts` | `dict[str, str]` | å­—ę®µå → ęœ€ä¼˜ prompt ę–‡ęœ¬ | +| `rounds` | `list[RoundRecord]` | ęÆč½®č®°å½• | +| `stop_reason` | `Literal[...]` ꈖ `None` | 哪个 stopper č§¦å‘ēš„åœęœŗ | + +å®Œę•“ 22 å­—ę®µļ¼ˆå« `RoundRecord`)见 [§7.4](#74-optimizeresult--roundrecord-字段蔨)怂 + +### 3.8 Reflection LM + +**äø€å„čÆ**ļ¼šęØ”å—å†…éƒØä½æē”Øēš„ LLMļ¼ŒęÆč½®ęŽ„ę”¶äø€ē»„å¤±č“„ę”ˆä¾‹ļ¼Œč¾“å‡ŗę”¹čæ›åŽēš„ prompt å€™é€‰ļ¼›äøŽä½  agent ä½æē”Øēš„äøšåŠ” LM ę˜Æäø¤å„—ē‹¬ē«‹é…ē½®ć€‚ + +在 `optimizer.json::optimize.algorithm.reflection_lm` ę®µé…ē½®ļ¼Œē±»åž‹ę˜Æ `OptimizeModelOptions`: + +```json +"reflection_lm": { + "model_name": "gpt-4o", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-...", + "generation_config": {"temperature": 0.6, "max_tokens": 4096} +} +``` + +ęØ”åž‹é€‰åž‹å»ŗč®®č§ [§6.5](#65-reflection-lm-é€‰åž‹å»ŗč®®č”Ø)ļ¼›å®Œę•“å­—ę®µč§ [§7.3.3](#733-optimizealgorithm-ꮵ)怂 + +## 4 ä½ ēš„åœŗę™Æ → ę€Žä¹ˆęŽ„å…„ + +| ä½ ēš„ęƒ…å†µ | ē« čŠ‚ | 对应 example | +|---|---|---| +| agent 是线上 HTTP ęœåŠ”ļ¼ˆFastAPI / Gin / č‡Ŗē ”ęŽ„å£ļ¼‰ | [§4.1](#41) | `http_service` | +| agent ę˜Æå­čæ›ēØ‹ / å‘½ä»¤č”Œå·„å…·ļ¼ˆ`claude` / `codex` / å†…éƒØ CLI) | [§4.2](#42) | `blackbox_cli` | +| agent 是多 sub-agent é“¾č·Æļ¼ˆå¤šäøŖ sub-agent åä½œå®Œęˆäø€ę¬”å“åŗ”ļ¼‰ļ¼Œęƒ³åŒę—¶ä¼˜åŒ–ęÆäøŖ sub-agent ēš„ prompt | [§4.3](#43) | `multi_agent_pipeline` | +| prompt äøåœØęœ¬åœ°ę–‡ä»¶ļ¼Œå­˜åœØčæœē«Æ KV / é…ē½®äø­åæƒ / ę•°ę®åŗ“ / Git ē­‰ä»»ę„åŽē«Æ | [§4.4](#44) | `remote_prompt_store` | 
+| å•äø€čÆ„ęµ‹ęŒ‡ę ‡äøå¤Ÿē”Øļ¼Œéœ€č¦åŒę—¶č·‘å¤šäøŖčÆ„ęµ‹ęŒ‡ę ‡ļ¼ˆå¦‚ē­”ę”ˆå‡†ē”®ēŽ‡ + å¹»č§‰ēŽ‡ + é£Žę ¼åˆč§„ēŽ‡ļ¼‰å¹¶čžåˆęˆę€»åˆ† | [§4.5](#45) | `multi_metric_with_judges` | +| ęƒ³ęŽ„å…„ CI é—­ēŽÆļ¼šPR ę—¶č·‘čÆ„ęµ‹å®ˆé—Øć€å¤œé—“ēŖ—å£č·‘ä¼˜åŒ–å¹¶č‡ŖåŠØå†™å›žę–° prompt | [§4.6](#46) | `ci_integration` | +| ä¼˜åŒ–ä»»åŠ”ęœ‰ē”¬ēŗ¦ęŸļ¼ˆå¦‚åæ…é”»åœØå‡Œę™Ø 1 å°ę—¶ēŖ—å£å®Œęˆ / ē“Æč®”č°ƒē”Øäøč¶… N ꬔ / čæžē»­ę— ęå‡å°±åœļ¼‰ | [§4.7](#47) | `slo_runtime_control` | +| å·²čƒ½č·‘é€šåŸŗē”€ęµēØ‹ļ¼Œęƒ³čæ›äø€ę­„ęå‡ę•ˆęžœļ¼ˆč°ƒę•“ GEPA 候选选择 / Pareto å‰ę²æ / č·Øå­—ę®µčžåˆļ¼‰ | [§4.8](#48) | `advanced_strategies` | +| å…¶ä»–åøøč§ę‰©å±•ļ¼ˆęŽ„ Grafana / WandB ē­‰ē›‘ęŽ§ć€č‡Ŗå®šä¹‰åœęœŗē­–ē•„ć€ē”Øč‡Ŗå·±ēš„ä¼˜åŒ–ē®—ę³•ļ¼‰ | [§4.9](#49) | (多 example ē»¼åˆļ¼‰ | + +### 4.1 ęˆ‘ēš„ agent 是 HTTP ęœåŠ”ļ¼Œę€Žä¹ˆęŽ„å…„ļ¼Ÿ {#41} + +**ä½ ēš„å¤„å¢ƒ**:业劔 agent å·²ē»ä½œäøŗē‹¬ē«‹ęœåŠ”äøŠēŗæļ¼ˆFastAPI / Gin / č‡Ŗē ”ę”†ęž¶å‡åÆļ¼‰ļ¼ŒåøŒęœ›åÆ¹å®ƒēš„ prompt åšč‡ŖåŠØä¼˜åŒ–ā€”ā€”ä½†ęœåŠ”é•æęœŸčæč”Œäøčƒ½åœć€ęœåŠ”å®žēŽ°ē»†čŠ‚åÆ¹ä¼˜åŒ–å™Øę˜Æé»‘ē›’ć€prompt é€šåøøä»„ę–‡ä»¶å½¢å¼ę³Øå…„ć€‚ + +**ęŽ„å…„ęØ”åž‹**ļ¼šä¼˜åŒ–å™Øä»„**纯客户端**čŗ«ä»½ęŽ„å…„ļ¼ŒäøŽęœåŠ”čæ›ēØ‹**åŖęœ‰äø€äøŖč€¦åˆē‚¹**ā€”ā€”ē£ē›˜äøŠēš„ prompt ꖇ件怂 + +```text ++-------------------+ HTTP request + query +-------------------+ +| AgentOptimizer | ----------------------------------> | HTTP agent | +| (optimizer) | <---------- response -------------- | (no code change) | ++---------+---------+ +---------+---------+ + | ^ + | write new prompt candidate | ęÆę¬”čÆ·ę±‚ + v | ēŽ°čÆ» prompt + +--------------------------------------------------------------+ + | prompt files (on disk) | + +--------------------------------------------------------------+ +``` + +ęœåŠ”čæ›ēØ‹**äøéœ€č¦ä»»ä½•ä»£ē ę”¹åŠØ**ļ¼ŒåŖéœ€č¦ę»”č¶³äø€äøŖēŗ¦å®šļ¼š**ęÆę¬”å¤„ē†čÆ·ę±‚å‰é‡čÆ» prompt ꖇ件**ā€”ā€”čæ™ę ·ä¼˜åŒ–å™Øå†™å…„ēš„ę–°å€™é€‰äø‹äø€ę¬”čÆ·ę±‚å°±ē”Ÿę•ˆć€‚ + +**ęŽ„å…„ 3 ę­„**: + +**第 1 歄:在 HTTP ęœåŠ”čÆ»å–ēš„ prompt ę–‡ä»¶äøŠę³Øå†Œ 
`TargetPrompt`** + +```python +target = TargetPrompt().add_path("system_prompt", "service/prompts/system.md") +``` + +`add_path` ēš„ē¬¬äŗŒäøŖå‚ę•°åæ…é”»ę˜Æ**ęœåŠ”čæ›ēØ‹å®žé™…čÆ»å–ēš„é‚£äøŖę–‡ä»¶č·Æå¾„**ļ¼ˆäøę˜Æä»»ę„å‰Æęœ¬ļ¼‰ļ¼Œå¦åˆ™ä¼˜åŒ–å™Øå†™å…„ēš„ę–°å€™é€‰äøä¼šč¢«ęœåŠ”ę„ŸēŸ„ć€‚ + +**第 2 歄:把 `call_agent` å†™ęˆäø€äøŖåÆ¹ęœåŠ”ēš„ HTTP 客户端** + +```python +async def call_agent(query: str) -> str: + async with httpx.AsyncClient(timeout=120.0) as client: + resp = await client.post("http://my-agent-service/chat", + json={"query": query}) + resp.raise_for_status() + return resp.json()["final_text"] +``` + +ęŒ‰äøšåŠ”å®žé™…ęŽ„å£ēš„ payload schema 改 `json=...` å­—ę®µļ¼›ęŒ‰äøšåŠ”é¦–ę¬”ęŽØē†č€—ę—¶č°ƒ `timeout`(example 默认 120s)。 + +**第 3 歄:调 `AgentOptimizer.optimize`** + +```python +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir=f"runs/{timestamp}", + update_source=False, # 决策蔨见 [§3.4](#34-agentoptimizer) +) +``` + +**ęŽ„å…„å‰č‡Ŗę£€č”Ø**: + +| ę£€ęŸ„é”¹ | čÆ“ę˜Ž | +|---|---| +| ęœåŠ”ęÆę¬”čÆ·ę±‚ę˜Æå¦é‡čÆ» prompt ꖇ件 | 否 → ä¼˜åŒ–å™Øå†™å…„ēš„ę–°å€™é€‰ęœåŠ”ēœ‹äøåˆ°ļ¼Œä¼˜åŒ–ę— ę•ˆć€‚éœ€č¦åœØ handler é‡ŒåŠ é‡čÆ»é€»č¾‘ | +| ä¼˜åŒ–å™Øčæ›ēØ‹åÆ¹ prompt ę–‡ä»¶ęœ‰å†™ęƒé™ | 否 → ä¼˜åŒ–å™Øę— ę³•č½ē›˜ę–°å€™é€‰ | +| ęœåŠ”åÆ¹ prompt ę–‡ä»¶č·Æå¾„äøŽä¼˜åŒ–å™Øēœ‹åˆ°ēš„ę˜Æå¦äø€č‡“ | å®¹å™ØåŒ–éƒØē½²ę—¶å°¤å…¶č¦ē”®č®¤ļ¼ˆmount 路径 / 软链) | +| ęœåŠ” 5xx 蔌为 | ęœåŠ”å†…éƒØäøč¦é™é»˜ retryā€”ā€”ä¼šęŽ©ē›–ēœŸå®žå¤±č“„ēŽ‡ļ¼Œč®©ä¼˜åŒ–å™Øēœ‹åˆ°å‡"高分" | + +**→ å®Œę•“ example**:[`examples/optimization/http_service/`](../../../examples/optimization/http_service/) +- `service/server.py` — 演示 prompt ēƒ­åŠ č½½ēš„ FastAPI ęœåŠ”ļ¼ˆ`/chat` ęÆę¬”é‡å»ŗ agent é‡čÆ» `system.md`ļ¼‰ļ¼ŒåÆä½œäøŗäøšåŠ”ęœåŠ”ę”¹é€ ēš„å‚č€ƒ +- `run_optimization.py` — å®¢ęˆ·ē«Æä¼˜åŒ–å™Øå…„å£ļ¼Œå«åÆåŠØå‰ęœåŠ”å„åŗ·ę£€ęŸ„ļ¼ˆfail-fast) + +### 4.2 
我的 agent 是外部命令行工具（CLI），优化器拿不到它的代码 {#42} + +**你的处境**:业务 agent 是个外部可执行程序——`claude` / `codex` / 自研 CLI 等。它的源代码、内部用的 LLM client、运行时语言对优化器**完全黑盒**，但它启动时会从某个工作目录读若干 prompt 文件（典型如 `CLAUDE.md` + `.claude/skills/<skill-name>/SKILL.md`）。你希望在不改 CLI 代码、不绑定它内部任何依赖的前提下优化这些 prompt 文件。 + +**接入模型**：优化器通过**子进程**调用 CLI，与 CLI 之间**唯一耦合点**还是磁盘上的 prompt 文件——这一点和 §4.1 的 HTTP 服务结构相同，差别只是把"HTTP 请求"换成"启动一个子进程"。 + +```text ++-------------------+ subprocess + query +-------------------+ +| AgentOptimizer | ------------------------------> | External CLI | +| (optimizer) | <-------- stdout text --------- | (no code change) | ++---------+---------+ +---------+---------+ + | ^ + | write new prompt candidate | 每次启动 + v | 自动读取 + +----------------------------------------------------------+ + | prompt files (on disk) | + +----------------------------------------------------------+ +``` + +CLI 二进制本身**不需要任何改动**，只需满足：**每次启动会从指定目录加载 prompt 文件**（绝大多数 CLI 工具都是这样设计的）。 + +**接入 3 步**: + +**第 1 步:在 CLI 读取的 prompt 文件上注册 `TargetPrompt`（多文件用多次 `add_path`）** + +```python +target = ( + TargetPrompt() + .add_path("claude_md", "workspace/CLAUDE.md") + .add_path("skill_md", "workspace/.claude/skills/city-info/SKILL.md") +) +``` + +每个 `add_path` 注册一个独立字段，GEPA 把每个字段视为一个独立可优化模块，可单独/联合优化（详见 §3.3、§4.3）。 + +**第 2 步:把 subprocess 调用 + stdout 规范化包成 `call_agent`** + +```python +async def call_agent(query: str) -> str: + proc = await asyncio.create_subprocess_exec(
"trpc-claudecode", "--print", + "--add-dir", str(WORKSPACE_DIR), # CLI 从这里加载 prompt 文件 + "--dangerously-skip-permissions", + query, # query 作 argv 直传，避免 shell 转义 + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=_build_cli_env(), # 业务自有 CLI 期望的环境变量 + ) + stdout_b, stderr_b = await asyncio.wait_for( + proc.communicate(), timeout=90.0, # 防止单次 CLI 卡死 + ) + if proc.returncode != 0: + raise RuntimeError(f"CLI exited {proc.returncode}: {stderr_b[:400]!r}") + return _normalize_response(stdout_b.decode("utf-8", "replace")) +``` + +`call_agent` 仍然是 §3.2 那个标准签名 `async (query: str) -> str`，对优化器主循环来说，这一份 `call_agent` 和"调本地 LLM"是无差别的。`_build_cli_env` / `_normalize_response` 是业务按自己 CLI 的特性自己实现的辅助函数（前者把环境变量改写/补齐成 CLI 期望的形态、后者把 CLI stdout 规整成评测可比的稳定字符串）——本框架不规定它们的形态，按需实现即可。 + +**第 3 步：跑一次确认 baseline 通畅，再交给 GEPA 反思优化** + +```python +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs/<timestamp>/", + update_source=False, +) +``` + +**接入前自检表**: + +| 检查项 | 不通过的后果 | +| --- | --- | +| CLI 是否每次启动都重读 prompt 文件 | 否 → 优化器写入的新候选不会生效；候选间评估等同于跑同一份 baseline | +| CLI 是否支持把 query 通过 argv / stdin / `--query xxx` 传入 | 否 → 接入不可行（需要先给 CLI 加这个入口） | +| CLI 平均单次耗时是否已知 | 否 → 无法合理设置 `CLI_TIMEOUT_SEC` 与 `max_metric_calls` | +| CLI 进程是否会污染共享磁盘状态（除 prompt 文件外） | 是 → 评测不可重复；需要 `eval_case_parallelism=1` 或为每个
case 起独立 workspace | + +**→ å®Œę•“ example**:[`examples/optimization/blackbox_cli/`](../../../examples/optimization/blackbox_cli/) +- `agent/call_agent.py` — subprocess č°ƒē”Ø + ēŽÆå¢ƒå˜é‡é€‚é… + stdout č§„čŒƒåŒ–ēš„å·„ēØ‹å®žēŽ°ļ¼ŒåÆä½œäøŗęŽ„å…„č‡Ŗęœ‰ CLI ēš„ę”¹é€ čµ·ē‚¹ +- `run_optimization.py` — åŒå­—ę®µļ¼ˆ`CLAUDE.md` + `SKILL.md`)`TargetPrompt` ēš„ę ‡å‡†å…„å£ + +### 4.3 ęˆ‘ēš„ agent 是多 sub-agent é“¾č·Æļ¼Œęƒ³åŒę—¶ä¼˜åŒ–ęÆäøŖ sub-agent ēš„ prompt {#43} + +**ä½ ēš„å¤„å¢ƒ**ļ¼šäøšåŠ”ä¾§å·²ē»ē¼–ęŽ’å„½å¤š sub-agent åä½œé“¾č·Æć€‚ęÆäøŖ sub-agent ęœ‰č‡Ŗå·±ēš„ system promptļ¼Œå­—ę®µé—“čæ˜å­˜åœØéšå¼å„‘ēŗ¦ļ¼ˆäøŠęøø sub-agent ēš„č¾“å‡ŗå½¢ę€åæ…é”»åŒ¹é…äø‹ęøøęœŸęœ›ļ¼‰ć€‚ę‰‹å·„čæ­ä»£ę—¶åøøč§ē—‡ēŠ¶ę˜Æ**"改 A č§ę•ˆļ¼Œä½†ę‹–ē“Æ B"**ć€‚ä½ åøŒęœ›åÆ¹ę‰€ęœ‰ sub-agent ēš„ prompt **č”åˆä¼˜åŒ–**ļ¼Œč®©ē«Æåˆ°ē«ÆęŒ‡ę ‡äøŠåˆ†ć€‚ + +**ęŽ„å…„ęØ”åž‹**ļ¼šęŠŠęÆäøŖ sub-agent ēš„ prompt ę³Øå†Œęˆ `TargetPrompt` ēš„äø€äøŖ**独立字段**——GEPA ęŠŠęÆäøŖå­—ę®µč§†äøŗäø€äøŖē‹¬ē«‹åÆä¼˜åŒ–ęØ”å—ļ¼ˆcomponentļ¼‰ļ¼ŒęÆč½®ęŒ‰ `module_selector` 选 1 äøŖęˆ–å¤šäøŖå­—ę®µå†™å›žļ¼Œä¼˜åŒ–å™ØåŖēœ‹ē«Æåˆ°ē«Æ metric åˆ†ę•°ä½œäøŗåé¦ˆć€‚é“¾č·Æä»£ē **å®Œå…Øé›¶äæ®ę”¹**ļ¼ŒęÆäøŖ sub-agent åœØęÆę¬”č¢«č°ƒē”Øę—¶é‡čÆ»č‡Ŗå·±ēš„ prompt ę–‡ä»¶å³åÆć€‚ + +```text ++-----------------------------+ round-robin fields +---------------------+ +| AgentOptimizer | ---------------------> | prompt files | +| (multi-field TargetPrompt) | write new candidate | (one per agent) | +| | | | ++--------------+--------------+ +----------+----------+ + ^ | + | end-to-end metric score | ęÆę¬”č°ƒē”Ø + | | ēŽ°čÆ» prompt + | v + | +-----------------------------------------+ + +------------- | call_agent(query) | + | = multi sub-agent pipeline entry | + | (sub-agent A -> sub-agent B -> ...) 
| + +-----------------------------------------+ +``` + +**接入 3 步**: + +**第 1 步：把每个 sub-agent 的 prompt 文件注册为独立字段** + +```python +target = ( + TargetPrompt() + .add_path("agent_a", "<path/to/agent_a_prompt>.md") + .add_path("agent_b", "<path/to/agent_b_prompt>.md") + # ... 一个 sub-agent 一个 add_path +) +``` + +key 是该字段在反思 prompt / 产物文件名中的标识，业务可读即可。 + +**第 2 步：把整条链路调用包成 `call_agent`，并保证 sub-agent 每次现读 prompt** + +```python +async def call_agent(query: str) -> str: + return await invoke_pipeline(query) # 你已有的链路入口 +``` + +`invoke_pipeline` 内部的关键约束：**每个 sub-agent 在每次被调用时必须重读自己的 prompt 文件**，否则优化器写入的新候选不会生效。 + +**第 3 步:在 `optimizer.json` 打开多字段相关的开关** + +```jsonc +{ + "optimize": { + "algorithm": { + "module_selector": "round_robin", // 每轮选 1 个字段轮换改写，便于归因 + "use_merge": true, // 累积若干单字段改进后主动融合 + "max_merge_invocations": 3, + "reflection_history_top_k": 3 // 多字段轮换时建议调大（默认 2） + } + } +} +``` + +各参数完整语义与取值对照见 [§7 完整 API 参考](#7-完整-api-参考)。 + +**接入前自检表**: + +| 检查项 | 不通过的后果 | +| --- | --- | +| 每个 sub-agent 是否每次被调用都重读自己的 prompt 文件 | 否 → 优化器写入的新候选不会生效；候选间评估等同于跑同一份 baseline | +| 端到端 metric 是否能反映各字段联合质量 | 否 → 反思 LM 拿到的反馈信号不真实；建议用 `final_response_avg_score` 评最终答复 | +| 单 case 经过几次 LLM 推理 | 调用量按链路深度倍增，需相应调小 `eval_case_parallelism` / `reflection_minibatch_size` 防 rate limit | +| sub-agent 是否需要在同一进程 | 不必——`call_agent` 内部可以是 HTTP / gRPC / 内部 SDK / 其他编排框架；只要最终返回 `str` 即可 | + +**→ 完整
example**:[`examples/optimization/multi_agent_pipeline/`](../../../examples/optimization/multi_agent_pipeline/) +- `pipeline/orchestrator.py` — 多 sub-agent é“¾č·Æå®žēŽ°ļ¼Œsub-agent åœØęÆę¬”č°ƒē”Øę—¶é‡čÆ» prompt +- `run_optimization.py` — å¤šå­—ę®µ `TargetPrompt` ēš„ę ‡å‡†å…„å£ +- `optimizer.json` — å¤šå­—ę®µåœŗę™Æēš„ęŽØčé…ē½® + +### 4.4 ęˆ‘ēš„ prompt äøåœØęœ¬åœ°ę–‡ä»¶ļ¼Œå­˜åœØčæœē«Æé…ē½®äø­åæƒ / KV / ę•°ę®åŗ“ {#44} + +**ä½ ēš„å¤„å¢ƒ**:业劔 prompt äøåœØęœ¬åœ°ę–‡ä»¶ļ¼Œč€Œę˜Æę”¾åœØčæœē«Æé…ē½®äø­åæƒļ¼ˆäøƒå½©ēŸ³ / Apollo / Nacos / 自研 KV / ę•°ę®åŗ“ / Git ē­‰ļ¼‰ļ¼ŒäøšåŠ”ä»Žäø­åæƒę‹‰å–ä½æē”Øć€‚ä¼˜åŒ–å™Øę— ę³•ē›“ęŽ„čµ°ę–‡ä»¶ē³»ē»Ÿā€”ā€”åŖčƒ½é€ščæ‡äøšåŠ”č‡Ŗęœ‰ SDK äøŽčæœē«Æäŗ¤äŗ’ć€‚ + +**ęŽ„å…„ęØ”åž‹**:`TargetPrompt` 把"prompt åœØå“Ŗé‡Œ"ęŠ½č±”ęˆäø€åÆ¹ async 函数 `read` / `write`ā€”ā€”ä¼˜åŒ–å™Øč°ƒ `read` 拿 baseline åæ«ē…§ć€č°ƒ `write` č½å€™é€‰ļ¼Œčæœē«ÆåŽē«Æå½¢ę€ļ¼ˆKV / RPC / SQL / Git API ...ļ¼‰åÆ¹ä¼˜åŒ–å™Ø**å®Œå…Øé»‘ē›’**ć€‚čæ™äøŽ §4.1 / §4.2 é€ščæ‡ęœ¬åœ° prompt ę–‡ä»¶č€¦åˆēš„ē»“ęž„åŒęž„ļ¼Œå·®åˆ«åŖę˜ÆęŠŠ"读写文件"ę¢ęˆ"č°ƒē”ØäøšåŠ”ē»™ēš„äø¤äøŖ async 函数"怂 + +```text ++-------------------+ async read / write +---------------------+ +| AgentOptimizer | <--------------------------------> | Remote Config | +| (optimizer) | (your SDK / HTTP / RPC) | (KV / DB / Git ...)| ++---------+---------+ +---------+-----------+ + ^ | + | best_prompts/ saved locally | äøšåŠ”ęÆę¬”č°ƒē”Ø + | | ēŽ°ę‹‰é…ē½® + v v + +-------------------+ +---------------------------+ + | output_dir/ | | inside call_agent | + | best_prompts/ | | pull latest prompt & run | + +-------------------+ +---------------------------+ +``` + +**ęŽ„å…„ 3 ę­„**: + +**第 1 ę­„ļ¼šå®žēŽ°äø€åÆ¹ę“ä½œčæœē«Æ prompt ēš„ async 函数** + +```python +async def read_prompt() -> str: + return await your_config_sdk.get(key="system_prompt") + +async def write_prompt(value: str) -> None: + await your_config_sdk.put(key="system_prompt", value=value) +``` + +ē­¾åēŗ¦ęŸļ¼š`read: async () -> str`态`write: async (str) -> 
None`ć€‚é‡čÆ• / 幂等性 / é‰“ęƒē”±äøšåŠ”č‡Ŗęœ‰ SDK äæčÆć€‚ + +**第 2 ę­„ļ¼šē”Ø `add_callback` 而非 `add_path` ę³Øå†Œ `TargetPrompt`** + +```python +target = TargetPrompt().add_callback( + "system_prompt", + read=read_prompt, + write=write_prompt, +) +``` + +`add_callback` äøŽ `add_path` 在 `TargetPrompt` äøŠåÆ¹ē­‰å¹¶å­˜ā€”ā€”å¤šå­—ę®µä¹ŸåÆä»„ę··ē”Øļ¼ˆéƒØåˆ†å­—ę®µåœØęœ¬åœ°ę–‡ä»¶ć€éƒØåˆ†å­—ę®µåœØčæœē«Æé…ē½®äø­åæƒļ¼‰ć€‚ + +**第 3 歄:把 `call_agent` å†™ęˆ"ēŽ°ę‹‰ēŽ°ē”Ø"ļ¼Œē…§åøøč°ƒ `optimize`** + +```python +async def call_agent(query: str) -> str: + prompt_text = await read_prompt() # ēŽ°ę‹‰ļ¼ŒäæčÆå€™é€‰å†™å…„ē«‹å³ē”Ÿę•ˆ + agent = create_agent(prompt_text) + return await runner.run_async(query, ...) + +await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=call_agent, + target_prompt=target, + train_dataset_path="train.evalset.json", + validation_dataset_path="val.evalset.json", + output_dir="runs//", + update_source=False, # 决策蔨见 §3.4 +) +``` + +`update_source` å–å€¼ē”±äøšåŠ”ä¾§ prompt å†™å›žē­–ē•„å†³å®šļ¼ˆčÆ¦č§ §3.4 å†³ē­–č”Øļ¼‰ļ¼Œę”†ęž¶åÆ¹å®ƒę²”ęœ‰é¢å¤–é™åˆ¶ć€‚ + +**ęŽ„å…„å‰č‡Ŗę£€č”Ø**: + +| ę£€ęŸ„é”¹ | äøé€ščæ‡ēš„åŽęžœ | +| --- | --- | +| äøšåŠ”ä¾§ęÆę¬”č°ƒē”Øę˜Æå¦é‡ę–°ę‹‰é…ē½® | 否 → ä¼˜åŒ–å™Øå†™å…„ę–°å€™é€‰åŽäøšåŠ”ę„ŸēŸ„äøåˆ°ļ¼Œåę€å¾ŖēŽÆå¤±ę•ˆ | +| `read` / `write` 是否都是 async 函数 | 否 → `add_callback` ę³Øå†Œę—¶å³ęŠ„é”™ | +| `write` ę˜Æå¦å¹‚ē­‰ļ¼ˆęŽ„å—é‡å¤å†™åŒäø€ value) | 否 → ę”¶å°¾č‡ŖåŠØå›žę»šåˆ° baseline ę—¶åÆčƒ½å¤±č“„ļ¼Œé—ē•™čæœē«Æč¢«ę±”ęŸ“ | +| ä¼˜åŒ–å™Øčæ›ēØ‹ę˜Æå¦åÆ¹čÆ„ key / namespace ęœ‰å†™ęƒé™ | 否 → `write` ęŠ›ęƒé™é”™čÆÆļ¼Œå½“å‰å€™é€‰čÆ„ä¼°å¤±č“„ | + +> **ę¶‰åŠē”Ÿäŗ§ prompt ēš„å®‰å…ØęØ”å¼**ļ¼ˆęŒ‰éœ€é‡‡ē”Øļ¼Œéžę”†ęž¶å¼ŗåˆ¶ļ¼‰ļ¼šäøšåŠ”ä¾§č‹„å·²ęœ‰ sandbox / production namespace éš”ē¦»ļ¼ŒåÆč®©ä¼˜åŒ–å™ØåŖčÆ»å†™ sandbox keyļ¼Œé…åˆ `update_source=False` č®©ä¼˜åŒ–å™Øę”¶å°¾č‡ŖåŠØå›žę»š sandboxļ¼Œęœ€ä½³å€™é€‰ä»…č½ęœ¬åœ° `best_prompts/`ļ¼Œå†ē”±äøšåŠ”č‡Ŗęœ‰å®”ę‰¹ęµåŒę­„åˆ° 
production怂`examples/optimization/remote_prompt_store/` ę¼”ē¤ŗēš„å°±ę˜Æčæ™ē§å·„ä½œęµć€‚ + +**→ å®Œę•“ example**:[`examples/optimization/remote_prompt_store/`](../../../examples/optimization/remote_prompt_store/) +- `store/prompt_client.py` — `read` / `write` async å‡½ę•°å®šä¹‰ļ¼Œę˜ÆęŽ„å…„äøšåŠ”é…ē½®äø­åæƒ SDK ēš„ę øåæƒę”¹é€ ē‚¹ +- `run_optimization.py` — `add_callback` ę³Øå†Œēš„ę ‡å‡†å…„å£ļ¼ˆę¼”ē¤ŗé‡‡ē”Ø sandbox + `update_source=False` + äŗŗå·„å®”ę‰¹ēš„å®‰å…Øå·„ä½œęµļ¼‰ + +### 4.5 å•äø€čÆ„ęµ‹ęŒ‡ę ‡äøå¤Ÿē”Øļ¼Œéœ€č¦å¤šäøŖęŒ‡ę ‡å¹¶čžåˆęˆę€»åˆ† {#45} + +**ä½ ēš„å¤„å¢ƒ**:业劔上线对 agent č¾“å‡ŗēš„č¦ę±‚å¾€å¾€äøę­¢äø€äøŖē»“åŗ¦ā€”ā€”ē­”ę”ˆå¾—åÆ¹ļ¼ˆę­£ē”®ę€§ē”¬ēŗ¦ęŸļ¼‰+ äøčƒ½ä¹±čÆ“ļ¼ˆå¹»č§‰ēŽ‡ļ¼‰+ é£Žę ¼ē¬¦åˆč§„čŒƒļ¼ˆę ¼å¼ / 语气)+ äøåø¦ę•ę„ŸčÆļ¼ˆåˆč§„ļ¼‰ā€¦ā€¦å•äø€ metric č£…äøäø‹ļ¼Œå¼ŗč”Œē”Øå•äøŖē»¼åˆ metric ēš„čÆļ¼Œåę€ LM ēœ‹åˆ°ēš„åé¦ˆäæ”å·ę˜Æę··åˆåŽēš„ę ‡é‡ļ¼Œå¾ˆéš¾å®šå‘å½’å› ć€‚ + +**ęŽ„å…„ęØ”åž‹**:`optimizer.json` ēš„ `evaluate.metrics` 是**åˆ—č”Ø**ā€”ā€”ē›“ęŽ„åˆ—å¤šę” metricļ¼ŒęÆę”ē‹¬ē«‹ę‰“åˆ†ć€ē‹¬ē«‹ thresholdć€ē‹¬ē«‹é…ē½®ć€‚ę—©åœåˆ¤å®šé€ščæ‡ `optimize.stop.required_metrics` å£°ę˜Žå“Ŗäŗ› metric 必锻达标;GEPA å†…éƒØé€ščæ‡ `optimize.algorithm.frontier_type` å†³å®šå¦‚ä½•åœØå¤š metric é—“ē»“ęŠ¤ Pareto å‰ę²æéæå…"改 A 拖瓯 B"ć€‚ę•“äøŖęœŗåˆ¶ēŗÆé…ē½®é©±åŠØā€”ā€”`call_agent` äøŽ `TargetPrompt` éƒ½äøéœ€č¦äøŗå¤š metric ę”¹äø€č”Œä»£ē ć€‚ + +**é…ē½® 3 ę­„**: + +**第 1 歄:在 `evaluate.metrics` åˆ—å‡ŗę‰€ęœ‰ metric** + +```jsonc +{ + "evaluate": { + "num_runs": 2, // 平滑 LLM č¾“å‡ŗę–¹å·®ļ¼ˆ>1 č®©ęÆę” case č·‘å¤šę¬”å–å‡å€¼ļ¼‰ + "metrics": [ + { + "metric_name": "llm_final_response", // ē”¬ēŗ¦ęŸļ¼šē­”ę”ˆę˜Æå¦äøŽ reference å®žč“Øē­‰ä»· + "threshold": 1.0, + "criterion": { "...": "..." } // å®Œę•“å­—ę®µč§ §7 / example + }, + { + "metric_name": "llm_rubric_response", // č½Æēŗ¦ęŸļ¼šå¤š rubricļ¼ˆę ¼å¼ / é£Žę ¼ / 单位 ...) + "threshold": 0.75, + "criterion": { "...": "..." 
} + } + ] + } +} +``` + +ęÆę” metric ē‹¬ē«‹ę‰“åˆ†ē‹¬ē«‹å†™å…„ `result.json` ēš„ `metric_breakdown`ļ¼Œä¾æäŗŽåå‘å½’å› ęŸę¬”čÆ„ęµ‹åœØå“Ŗę” metric äøŠęŽ‰åˆ†ć€‚ + +**第 2 歄:在 `optimize.stop.required_metrics` å£°ę˜Žę—©åœé—Øē¦** + +| 取值 | 语义 | é€‚ē”Øåœŗę™Æ | +| --- | --- | --- | +| `"all"` | ꉀ꜉ metric 都达 threshold ę‰ę—©åœ | ꉀ꜉ metric éƒ½ę˜Æåæ…é”»č¾¾ę ‡é”¹ | +| `["m1", "m2"]` | åˆ—č”Øäø­ę‰€ęœ‰ metric č¾¾ threshold ę‰ę—©åœļ¼ˆå…¶ä»– metric ä»å‚äøŽčÆ„ęµ‹ä½†äøå½±å“ę—©åœļ¼‰ | éƒØåˆ† metric ę˜Æå‚č€ƒč§‚ęµ‹é”¹ć€äøä½œäøŗé—Øē¦ | +| `null` ꈖ `[]` | äøå‚äøŽę—©åœļ¼Œä»…é ē®—ę³•å±‚ budget / no-improvement / score_threshold ęŽ§åˆ¶ | åŖęƒ³č·‘ę»”é¢„ē®—ēœ‹ē»“ęžœ | + +**第 3 歄:把 `frontier_type` č°ƒåˆ°čƒ½ę­£ē”®å¤„ē†å¤š metric ēš„å–å€¼** + +| 取值 | 含义 | 适用 | +| --- | --- | --- | +| `instance` | ęÆäøŖ case ē»“ęŠ¤äø€äøŖ best 候选 | 单 metric ꈖ metric é—“ę— ę˜Žę˜¾å†²ēŖ | +| `objective` | ęÆäøŖ metric ē»“ęŠ¤äø€äøŖ best 候选 | 多 metric 但 case é‡č¾ƒå° | +| `hybrid` | åŒę—¶ē»“ęŠ¤ case + metric åŒå±‚å‰ę²æ | **多 metric ēœŸå†²ēŖåœŗę™Æ**ļ¼ˆęŽØčé»˜č®¤ļ¼‰ | +| `cartesian` | ęÆäøŖ (case, metric) ē»„åˆäø€äøŖ best | ęžå¤ę‚ / č°ƒčÆ•ē”Øļ¼Œå€™é€‰ę± å®¹ę˜“ēˆ†ē‚ø | + +`hybrid` 让 GEPA åœØę”¹čæ›äø€äøŖ metric ę—¶äøäø¢å¤±å¦äø€äøŖ metric äøŠēš„ęœ€ä½³å€™é€‰ā€”ā€”**多 metric äøšåŠ”ēš„å®‰å…Øé»˜č®¤**ć€‚å„å–å€¼å®Œę•“å®šä¹‰č§ [§7](#7-å®Œę•“-api-å‚č€ƒ)怂 + +**ęŽ„å…„å‰č‡Ŗę£€č”Ø**: + +| ę£€ęŸ„é”¹ | äøé€ščæ‡ēš„åŽęžœ | +| --- | --- | +| 各 metric ēš„ `threshold` ę˜Æå¦ē¬¦åˆäøšåŠ”čÆ‰ę±‚ | 否 → ę—©åœåˆ¤å®šäøå‡†ļ¼›ä¼˜åŒ–ē»“ęŸę—¶äøšåŠ”å…³é”®ęŒ‡ę ‡åÆčƒ½ęœŖč¾¾ę ‡ | +| ę˜Æå¦åŖęœ‰"ē”¬ēŗ¦ęŸ"č¢«åˆ—å…„ `stop.required_metrics` | 否 → č½Æēŗ¦ęŸę³¢åŠØä¼šåå¤ę‰“ę–­ę—©åœåˆ¤å®šļ¼ŒęµŖč“¹é¢„ē®— | +| `eval_case_parallelism` ę˜Æå¦č€ƒč™‘äŗ† metric ꕰ Ɨ judge ę•°ēš„å¹¶å‘é‡ | 否 → å•č½® LLM č°ƒē”Øé‡ēˆ†ē‚øļ¼ˆN case Ɨ M metric Ɨ K judge Ɨ `num_runs`ļ¼‰ļ¼Œå®¹ę˜“ę’ž LLM åŽē«Æ rate limit | +| `num_runs` ę˜Æå¦åˆē†ļ¼ˆé»˜č®¤ 1) | 单 LLM judge č¾“å‡ŗå­˜åœØę–¹å·®ļ¼›å»ŗč®® `num_runs=2` č®©ęÆę” case 
č·‘äø¤ę¬”å–å‡å€¼ę¶ˆé™¤ęŠ–åŠØ | + +**→ å®Œę•“ example**:[`examples/optimization/multi_metric_with_judges/`](../../../examples/optimization/multi_metric_with_judges/) +- `optimizer.json` — `llm_final_response`(多 judge `all_pass` ęŠ•ē„Øļ¼‰+ `llm_rubric_response`ļ¼ˆå• judge 多 rubric)+ `frontier_type=hybrid` + `stop.required_metrics` åˆ—č”Øå¼ēš„å®Œę•“é…ē½®čŒƒä¾‹ +- `run_optimization.py` — äøŽå• metric åœŗę™Æäø€č‡“ēš„ę ‡å‡†å…„å£ļ¼ˆå¤š metric äøå½±å“å…„å£ä»£ē ļ¼‰ + +### 4.6 ęƒ³ęŽ„å…„ CI é—­ēŽÆļ¼šPR å®ˆé—Ø + å¤œé—“ä¼˜åŒ–č‡ŖåŠØå†™å›ž {#46} + +**ä½ ēš„å¤„å¢ƒ**ļ¼šä½ åøŒęœ› prompt å·„ēØ‹ä¹Ÿčµ° CI/CD ęµēØ‹ā€”ā€”ęÆę¬” PR č‡ŖåŠØč·‘čÆ„ęµ‹å®ˆé—Øļ¼ˆåˆ†ę•°ä½ŽäŗŽé˜ˆå€¼å³ CI ēŗ¢ēÆļ¼Œé˜»ę­¢åŠ£åŒ– prompt čæ›äø»å¹²ļ¼‰ļ¼ŒåŒę—¶åœØä½Žå³°ēŖ—å£č‡ŖåŠØč·‘åę€ä¼˜åŒ–ęŠŠę›“ä¼˜ prompt å†™å›žęŗę–‡ä»¶ļ¼Œäø‹äø€ę¬” PR č‡ŖåŠØē”ØäøŠć€‚**å•ē‹¬ä½æē”Øä»»äø€é“¾č·Æéƒ½äøå¤Ÿ**ļ¼šēŗÆå®ˆé—Øäøä¼šč®© prompt č‡ŖåŠØå˜å„½ļ¼ŒēŗÆä¼˜åŒ–ę²”ęœ‰č“Øé‡é—Øē¦ć€‚ + +**ęŽ„å…„ęØ”åž‹**:`AgentEvaluator.evaluate`(pytest č·‘ PR å®ˆé—Øļ¼‰äøŽ `AgentOptimizer.optimize`ļ¼ˆå¤œé—“ä¼˜åŒ–ļ¼‰å…±äŗ«**åŒäø€ä»½čµ„äŗ§**ā€”ā€”åŒäø€äøŖ `call_agent`ć€åŒäø€ä»½ evalsetļ¼ˆē‰©ē†äøŠę‹† train / val äø¤ę–‡ä»¶é˜²ę³„ę¼ļ¼Œé€»č¾‘äøŠäø€å„—čÆ­ę–™ļ¼‰ć€åŒäø€åÆ¹ prompt ꖇ件怂`update_source=True` ę˜Æé—­ēŽÆēš„å…³é”®å¼€å…³ļ¼šä¼˜åŒ–ęˆåŠŸļ¼ˆ`OptimizeResult.status=SUCCEEDED`ļ¼‰åŽęœ€ä¼˜å€™é€‰ē›“ęŽ„č¦†ē›–ęŗ prompt ę–‡ä»¶ļ¼Œäø‹äø€ę¬” PR č§¦å‘ēš„ pytest č‡ŖåŠØčÆ»å–ę–°å†…å®¹ć€‚ + +```text + +-----------------------------------------------------+ + | Shared: call_agent + evalset + prompt files | + +------+----------------------------------------+-----+ + | | + Trigger: PR | | Trigger: Night + v v + +---------------------------+ +---------------------------+ + | AgentEvaluator.evaluate | | AgentOptimizer.optimize | + | (pytest) | | update_source=True | + | | | | + | Score < threshold -> Red | | OK -> overwrite prompt | + | pytest exit != 0 -> Block| | Fail -> keep unchanged | + +---------------------------+ +-------------+-------------+ + | + v 
+ 下一次 PR 自动用新 prompt + (形成 eval->optimize->eval 闭环) +``` + +**接入 3 步**: + +**第 1 步:把 `call_agent` 抽到 evaluate / optimize 共享的模块里** + +```python +# agent/agent.py(pytest 与 optimizer 都从这里 import) +async def call_agent(query: str) -> str: + ... +``` + +**为什么必须共享**:评测时使用的 agent 和优化时使用的 agent 必须**等价**——否则会出现"优化器找到了 evaluator 验证不了的好 prompt"或反向问题。共享同一个 `call_agent` 文件是最直接的代码级保证。任何 agent 改动(模型切换 / temperature 调整 / output schema 变化)只需改一处。 + +**第 2 步:写 PR 守门的 pytest 入口** + +```python +# tests/test_agent_quality.py +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator +from agent.agent import call_agent + +@pytest.mark.asyncio +async def test_agent_quality(): + await AgentEvaluator.evaluate( + call_agent=call_agent, + eval_set_path="data/val.evalset.json", + test_config_path="optimizer.json", # 复用同一份 metric 配置 + ... 
+ ) # åˆ†ę•°ä½ŽäŗŽ threshold ꗶꔆꞶꊛ AssertionError → pytest ēŗ¢ +``` + +CI ęµę°“ēŗæé‡Œč·‘ļ¼š + +```bash +pytest tests/ --junitxml=runs/pytest_report.xml +``` + +`--junitxml` č¾“å‡ŗę ‡å‡†ę ¼å¼ēš„ęµ‹čÆ•ęŠ„å‘Šļ¼ŒGitHub Actions / č“ē›¾ęµę°“ēŗæ / Tencent CI ē­‰äø»ęµå¹³å°å‡åŽŸē”Ÿč§£ęžć€‚å¤±č“„ę—¶ `AssertionError` ę¶ˆęÆé‡ŒåŒ…å«ęÆę” case ēš„å¤±č“„ę˜Žē»† JSON,CI å¹³å°å±•ē¤ŗ stack trace ę—¶åÆē›“ęŽ„ēœ‹åˆ°å…·ä½“å“Ŗę” case 失蓄、agent å®žé™…č¾“å‡ŗę˜Æä»€ä¹ˆć€äøŽ expected ēš„å·®å¼‚åœØå“Ŗć€‚ + +**第 3 ę­„ļ¼šå¤œé—“ēŖ—å£č·‘ä¼˜åŒ– + `update_source=True`** + +```python +# run_optimization.pyļ¼ˆå¤œé—“ cron č§¦å‘ļ¼‰ +await AgentOptimizer.optimize( + config_path="optimizer.json", # äøŽ pytest 共用 metric é…ē½® + call_agent=call_agent, # äøŽ pytest 共用 call_agent + target_prompt=target, + train_dataset_path="data/train.evalset.json", + validation_dataset_path="data/val.evalset.json", + output_dir="runs/optimize_/", + update_source=True, # CI é—­ēŽÆēš„å…³é”®å¼€å…³ +) +``` + +`update_source=True` ēš„å®‰å…ØäæčÆļ¼šä»… `OptimizeResult.status=SUCCEEDED` ę—¶ę‰ä¼šå†™å›žļ¼›å¤±č“„ / é¢„ē®—č€—å°½ē­‰å…¶ä»–ēŠ¶ę€äø‹ęŗę–‡ä»¶äæęŒäøå˜ć€‚č¦†ē›–é‡‡ē”ØåŽŸå­å†™ļ¼ˆtmp + `os.replace`ļ¼‰ļ¼Œäø­é€”å¼‚åøø / SIGINT äøä¼šęŸåęŗ prompt ę–‡ä»¶ļ¼ˆčÆ¦č§ [§8.3](#83-åŽŸå­č½ē›˜äæčÆ))。 + +å¤œé—“č„šęœ¬ęœ«å°¾å»ŗč®®åŠ  `git diff --quiet agent/prompts/` åˆ¤ę–­ę˜Æå¦ęœ‰ę”¹åŠØļ¼Œę— ę”¹åŠØē›“ęŽ„é€€å‡ŗļ¼›ęœ‰ę”¹åŠØåˆ™ `git checkout -b ...` + č‡ŖåŠØå¼€ PR——让新 prompt 走标准 PR review ęµēØ‹č€Œäøę˜Æē›“ęŽ„čæ›äø»å¹²ć€‚ + +**ęŽ„å…„å‰č‡Ŗę£€č”Ø**: + +| ę£€ęŸ„é”¹ | äøé€ščæ‡ēš„åŽęžœ | +| --- | --- | +| `call_agent` 是否被 pytest äøŽ optimizer **å…±ē”ØåŒäø€ä»½ä»£ē ** | 否 → čÆ„ęµ‹äøŽä¼˜åŒ–ēš„ agent äøē­‰ä»·ļ¼›ä¼˜åŒ–ę–¹å‘äøŽå®ˆé—Øę–¹å‘ę¼‚ē§» | +| pytest äøŽ optimizer ę˜Æå¦ä½æē”Ø**åŒäø€ä»½ metric é…ē½®** | 否 → "čÆ„ęµ‹čƒ½čæ‡ä½†ä¼˜åŒ–å™Øēœ‹åˆ°ēš„åˆ†ę•°ä½Ž"ęˆ–åå‘é—®é¢˜ć€‚å»ŗč®® `optimizer.json.evaluate` 段在 pytest é‡Œé€ščæ‡ `test_config_path` å¤ē”Ø | +| evalset ę˜Æå¦ē‰©ē†ę‹†äøŗ train / val 两文件 | 否 → SDK 
`_validate_inputs` 强制栔验 `train != val`ļ¼Œå¦åˆ™ęŠ„é”™ fail-fast | +| å¤œé—“č„šęœ¬ē»“ęŸę—¶ę˜Æå¦ęœ‰ `git diff` + č‡ŖåŠØå¼€ PR ę­„éŖ¤ | 否 → ä¼˜åŒ–ēš„ prompt ē›“ęŽ„čæ›äø»å¹²ļ¼Œē»•čæ‡ reviewļ¼›å»ŗč®®ę°øčæœčµ° PR 流程 | +| ę˜Æå¦å‡†å¤‡å„½ prompt ę”¹åŠØēš„ē°åŗ¦ē­–ē•„ | å¤šäøšåŠ”ēŗæå…±äŗ«åŒäø€ä»½ prompt ä»“åŗ“ę—¶ļ¼Œå»ŗč®®ę”¹ē”Ø `update_source=False` + äøšåŠ”č‡Ŗęœ‰ē°åŗ¦å‘åøƒå·„å…· | + +**→ å®Œę•“ example**:[`examples/optimization/ci_integration/`](../../../examples/optimization/ci_integration/) +- `agent/agent.py` — pytest äøŽ optimizer å…±äŗ«ēš„ `call_agent` +- `tests/test_agent_quality.py` — pytest å®ˆé—Øå…„å£ļ¼ˆPR é˜¶ę®µč°ƒē”Øļ¼‰ +- `run_optimization.py` — å¤œé—“ä¼˜åŒ–å…„å£ļ¼ˆ`update_source=True`) +- `ci/run_pr_check.sh` / `ci/run_nightly_optimize.sh` — CI 流氓线 shell 兄口 + +### 4.7 ä¼˜åŒ–ä»»åŠ”ęœ‰ē”¬ēŗ¦ęŸļ¼šåæ…é”»åœØęŸę—¶é—“ēŖ—å†…å®Œęˆ / ē“Æč®”č°ƒē”Øäøč¶… N ꬔ / čæžē»­ę— ęå‡å°±åœ {#47} + +**ä½ ēš„å¤„å¢ƒ**ļ¼šä½ ēš„ä¼˜åŒ–ä»»åŠ”č·‘åœØå—ēŗ¦ęŸēš„ēŽÆå¢ƒé‡Œā€”ā€”CI 流氓线必锻 N åˆ†é’Ÿå†…ē»“ęŸć€LLM åŽē«Æé…é¢ęŒ‰ęœˆč®”ē®—å•ę¬”äøčƒ½č·‘ēˆ†ć€čæžē»­č‹„å¹²č½®ę²”ę”¹å–„åŗ”äø»åŠØę”¾å¼ƒåˆ«ęµŖč“¹é¢„ē®—ć€‚**å•äøŖåœę­¢ę”ä»¶äøå¤Ÿ**ļ¼šåŖč®¾ timeout åÆčƒ½é¢„ē®—čæ˜ę²”ē”Øå®Œå°±åœć€åŖč®¾é¢„ē®—åÆčƒ½č·‘åˆ°å¤©č’åœ°č€ć€‚ä½ éœ€č¦"ä»»ę„äø€äøŖ SLO č§¦å‘å°±ē«‹åˆ»åœ"ēš„å¤šé‡åœę­¢ē­–ē•„ć€‚ + +**ęŽ„å…„ęØ”åž‹**:`optimizer.json` ēš„ `optimize.algorithm` ę®µęä¾› 6 ē§ algorithm-level stop conditions,**OR 语义**ā€”ā€”ä»»ę„äø€ę”č§¦å‘å³åœę­¢ć€‚ä½ ęŒ‰äøšåŠ” SLO åęŽØęÆę”é˜ˆå€¼ļ¼Œå¤šäøŖå¼€å…³åŒę—¶åÆē”Øå³åÆć€‚ä¼˜åŒ–ē»“ęŸę—¶ `OptimizeResult.stop_reason` å­—ę®µå‘ŠčÆ‰ä½ å“Ŗę” SLO ęŠ¢é—øļ¼Œä¾æäŗŽåŽē»­č°ƒå‚ć€‚ + +**é…ē½® 3 ę­„**: + +**第 1 ę­„ļ¼šä»Ž 6 ē§ stop condition äø­é€‰å‡ŗäøšåŠ”å…³åæƒēš„å‡ ę”** + +| 字段 | ęŠ¢é—øę”ä»¶ | å…øåž‹äøšåŠ”åœŗę™Æ | +| --- | --- | --- | +| `timeout_seconds` | wall-clock 超过 N ē§’ | CI ęµę°“ēŗæę—¶é—“ēŖ—ē”¬ēŗ¦ęŸļ¼ˆåæ…é”» N åˆ†é’Ÿå†…ē»“ęŸļ¼‰ | +| `max_metric_calls` | 瓯讔 case 评估欔数 ≄ N | LLM åŽē«Æé…é¢ē”¬äøŠé™ | +| `max_candidate_proposals` | 
reflection LM ē“Æč®”ęč®®ę¬”ę•° ≄ N | é™åˆ¶åę€ LM č°ƒē”Øé¢„ē®— | +| `max_iterations_without_improvement` | čæžē»­ N č½® best valset ę— ęå‡ | å·²ę”¶ę•›ęˆ–é™·å…„å±€éƒØęœ€ä¼˜ę—¶äø»åŠØę”¾å¼ƒ | +| `score_threshold` | best valset pass_rate ≄ 阈值 | å·²č¾¾äøšåŠ”ē›®ę ‡ļ¼Œę— éœ€ē»§ē»­ | +| `max_tracked_candidates` | Pareto å‰ę²æå€™é€‰ę± å¤§å° ≄ N | ęŽ§åˆ¶å†…å­˜äøŽ merge 候选空闓规樔 | + +å„å­—ę®µå®Œę•“å®šä¹‰č§ [§7.3.3](#733-optimizealgorithm-ꮵ)怂**č‡³å°‘é… 1 äøŖ**ā€”ā€”å¦åˆ™ę”†ęž¶åÆåŠØęœŸ fail-fast怂 + +**第 2 ę­„ļ¼šęŒ‰äøšåŠ” SLO åęŽØęÆę”é˜ˆå€¼** + +```jsonc +{ + "optimize": { + "algorithm": { + "timeout_seconds": 90.0, // CI åæ…é”» X åˆ†é’Ÿå†…ē»“ęŸ → 设 X*60 / 2 留缓冲 + "max_metric_calls": 30, // LLM 配额 → ꌉ"č°ƒē”Øę¬”ę•° Ɨ å•ę¬”č€—ę—¶"åē®— + "max_iterations_without_improvement": 3, // čæžē»­ 3 č½®ę— ęå‡å³ę”¾å¼ƒ + "score_threshold": 1.0 // č¾¾åˆ°äøšåŠ”ē›®ę ‡å³åœ + } + } +} +``` + +**äø¤äøŖåęŽØå…³é”®**: + +| 锹 | ę€Žä¹ˆęµ‹ | ę€Žä¹ˆåęŽØ | +| --- | --- | --- | +| å•č½®å…øåž‹č€—ę—¶ | ęµ‹äø€ę¬”åŸŗå‡†č·‘ļ¼Œēœ‹ `runs//result.json` äø­ round ēš„ wall-clock ę—¶é—“ | `timeout_seconds` åŗ”č‡³å°‘äøŗå•č½®č€—ę—¶ Ɨ 2ļ¼Œå¦åˆ™ē¬¬ 1 č½®å°±ęŠ¢é—øēœ‹äøåˆ°ä¼˜åŒ–čæ›å±• | +| å•č½® metric_calls ꕰ | åŒäøŠļ¼Œēœ‹ round ēš„ `metric_calls_in_round` | `max_metric_calls` åŗ”č‡³å°‘čƒ½č·‘čæ‡ `max_iterations_without_improvement` č½®ļ¼Œå¦åˆ™ę°øčæœę˜Æ budget å…ˆęŠ¢é—ø | + +**第 3 ę­„ļ¼šę˜Žē”®ę˜Æå¦å‚äøŽ framework-level metric ę—©åœ** + +| 取值 | 语义 | +| --- | --- | +| `optimize.stop.required_metrics: "all"` ꈖ `["m1"]` | metric č¾¾ threshold ä¹Ÿå‚äøŽ OR ęŠ¢é—ø | +| `optimize.stop.required_metrics: []` | åŖč®© 6 äøŖ algorithm ēŗ§ stopper 决定 | + +äøšåŠ”čÆ‰ę±‚ļ¼š +- **å…³åæƒ metric ę˜Æå¦č¾¾ę ‡**ļ¼ˆå…øåž‹ēš„ prompt č“Øé‡ä¼˜åŒ–ļ¼‰ā†’ 用 `"all"` ęˆ–å…·ä½“åˆ—č”Ø +- **åŖå…³åæƒę—¶é—“ / č°ƒē”Øé¢„ē®—**ļ¼ˆå·²ēŸ„åæ…ę”¶ę•›ć€ēŗÆå”čµ„ęŗļ¼‰ → 用 `[]` + +**`stop_reason` å–å€¼å‚č€ƒ**ļ¼šä¼˜åŒ–ē»“ęŸę—¶ `OptimizeResult.stop_reason` å€¼čƒ½å‘ŠčÆ‰ä½ ęŠ¢é—øč€…ā€”ā€”`score_threshold_reached` / 
`budget_exhausted` / `timeout_reached` / `no_improvement` / `max_proposals_reached` / `max_tracked_candidates_reached` / `user_requested_stop`ļ¼ˆē”Øęˆ·é€ščæ‡ `optimize.stop` å“Øå…µę–‡ä»¶äø»åŠØč§¦å‘ļ¼‰ć€‚ + +**ęŽ„å…„å‰č‡Ŗę£€č”Ø**: + +| ę£€ęŸ„é”¹ | äøé€ščæ‡ēš„åŽęžœ | +| --- | --- | +| å„é˜ˆå€¼ę˜Æå¦ē»čæ‡åŸŗå‡†ęµ‹é‡åęŽØć€č€Œéžå‡­ē›“č§‰ę‹č„‘č¢‹ | 否 → å¤§ę¦‚ēŽ‡ęŸę” stopper ę°øčæœå…ˆęŠ¢é—øļ¼ˆå¦‚ timeout 在第 1 č½®å°±č§¦å‘ļ¼‰ļ¼Œå…¶ä»–é…ē½®å½¢åŒč™šč®¾ | +| `timeout_seconds` ę˜Æå¦é¢„ē•™ē¼“å†²ļ¼ˆā‰¤ äøšåŠ”ēœŸå®žēŖ—å£ēš„ 50%) | 否 → ꔆꞶ"å®Œęˆå½“å‰č½®å†åœ"čÆ­ä¹‰äø‹å®žé™…ē»ˆę­¢ę—¶é—“åÆčƒ½č¶…čæ‡ timeout č®¾å®šå€¼ļ¼Œę’žäøšåŠ”ē”¬ęˆŖę­¢ | +| å•č½®å†…ēš„ LLM č°ƒē”Øę˜Æå¦ęœ‰č‡Ŗå·±ēš„č¶…ę—¶ļ¼ˆå¦‚ CLI / HTTP č°ƒē”Øļ¼‰ | 否 → å•č½®å”ä½ę•“äøŖ timeout ä¹ŸåŖčƒ½ē­‰å½“å‰č½®č·‘å®Œļ¼ŒåÆčƒ½äø„é‡č¶…ę—¶ļ¼ˆå‚č€ƒ §4.2 ēš„ CLI_TIMEOUT_SEC ęØ”å¼ļ¼‰ | +| ę˜Æå¦åœØęµ‹čÆ•ēŽÆå¢ƒč·‘čæ‡äø€ę¬”åŸŗå‡†ļ¼ŒéŖŒčÆ `stop_reason` äøŽé¢„ęœŸäø€č‡“ | 否 → 上 CI åŽę‰å‘ēŽ° stopper č”ŒäøŗäøŽé¢„ęœŸäøē¬¦ļ¼Œę— ę³•åæ«é€ŸčÆŠę–­ | + +**→ å®Œę•“ example**:[`examples/optimization/slo_runtime_control/`](../../../examples/optimization/slo_runtime_control/) +- `optimizer.json` — 6 ē§ stop condition å…ØéƒØåÆē”Øēš„é…ē½®čŒƒä¾‹ļ¼ˆäøšåŠ”ēœŸå®žęŽ„å…„åŗ”ę ¹ę®č‡Ŗęœ‰ SLO åęŽØé˜ˆå€¼ļ¼Œäøč¦ē›“ęŽ„å¤åˆ¶ example ēš„å€¼ļ¼‰ +- `run_optimization.py` — č·‘å®ŒåŽ `result.json.stop_reason` å­—ę®µę ‡čÆ†ęŠ¢é—øč€… + +### 4.8 å·²čƒ½č·‘é€šåŸŗē”€ęµēØ‹ļ¼Œęƒ³čæ›äø€ę­„ęå‡ę•ˆęžœļ¼ˆGEPA 候选选择 / Pareto å‰ę²æ / č·Øå­—ę®µčžåˆļ¼‰ {#48} + +**ä½ ēš„å¤„å¢ƒ**ļ¼šä½ å·²ē»ęŒ‰ quickstart č·‘é€šäŗ†åŸŗē”€ä¼˜åŒ–ęµēØ‹ļ¼Œčƒ½ēØ³å®šēœ‹åˆ° baseline → best ēš„ęåˆ†ć€‚ēŽ°åœØęƒ³ē†č§£ GEPA ēš„å‡ äøŖé«˜é˜¶å¼€å…³ā€”ā€”`candidate_selection_strategy` / `frontier_type` / `use_merge` / `skip_perfect_score`ā€”ā€”åœØä½ ēš„ä»»åŠ”äøŠ**åˆ°åŗ•ęœ‰ę²”ęœ‰ē”Øć€čƒ½äøčƒ½å†ę¦Øå‡ŗå‡ äøŖē‚¹**ć€‚ä½†ä½ å•č·‘äø€ę¬”ä¼˜åŒ–å¾€å¾€ēœ‹äøå‡ŗå·®å¼‚ļ¼Œå› äøŗ GEPA åœØå¤šę•°ä»»åŠ”äøŠéƒ½čƒ½ę”¶ę•›åˆ°ē›øčæ‘ `best_pass_rate`——**å·®å¼‚č—åœØåˆ°č¾¾č·Æå¾„é‡Œ**ļ¼ˆč½®ę¬”ę•° 
/ ęŽ„å—ēŽ‡ / merge ę˜Æå¦č§¦å‘ / reflection LM č°ƒē”Øę•°ļ¼‰ļ¼ŒäøåœØęœ€ē»ˆåˆ†ę•°ć€‚ + +**ęŽ„å…„ęØ”åž‹**ļ¼šē”Ø **A/B åÆ¹ē…§å®žéŖŒ**ā€”ā€”åŒäø€ä»½äøšåŠ”ć€åŒäø€ä»½ evalsetć€åŒäø€äøŖ `seed`ļ¼Œč·‘äø¤å„—äøåŒēš„ `optimizer.json`ļ¼šäø€ä»½ę˜Æå½“å‰ēŗæäøŠé…ē½®ęˆ–é»˜č®¤é…ē½®ļ¼ˆbaselineļ¼‰ļ¼Œäø€ä»½ę˜ÆåøŒęœ›éŖŒčÆēš„é«˜é˜¶ē»„åˆļ¼ˆadvancedļ¼‰ć€‚č·‘å®ŒåŽåÆ¹ęÆ”äø¤ę¬”ēš„ `result.json`ļ¼Œå…³ę³Ø**å¤šē»“åŗ¦ęŒ‡ę ‡**č€Œéžå•äø€ `best_pass_rate`怂 + +**å®žéŖŒ 3 ę­„**: + +**第 1 ę­„ļ¼šęŠŠå½“å‰é…ē½®ä½œäøŗ baselineļ¼Œå›ŗå®šå…¶ä½™å˜é‡** + +```jsonc +// optimizer_baseline.json +{ + "optimize": { + "algorithm": { + "seed": 42, // å›ŗå®š seed ęŽ’é™¤éšęœŗę€§ + "max_metric_calls": 30, // äøŽ advanced äæęŒäø€č‡“ä»„å…¬å¹³åÆ¹ęÆ” + "candidate_selection_strategy": "pareto", + "frontier_type": "instance", + "skip_perfect_score": false, + "use_merge": false + } + } +} +``` + +**第 2 ę­„ļ¼šå†™ advanced é…ē½®ļ¼ŒåŖę”¹č¦éŖŒčÆēš„å¼€å…³** + +```jsonc +// optimizer_advanced.jsonļ¼ˆäøŽ baseline 仅差几个开关) +{ + "optimize": { + "algorithm": { + "seed": 42, + "max_metric_calls": 30, + "candidate_selection_strategy": "pareto", + "frontier_type": "objective", // ę”¹ļ¼šä»Ž instance 切到 objective + "skip_perfect_score": true, // ę”¹ļ¼šč·³čæ‡ę»”åˆ† case čŠ‚ēœåę€č°ƒē”Ø + "use_merge": true // ę”¹ļ¼šåÆē”Øč·Øå­—ę®µčžåˆļ¼ˆä»…å¤šå­—ę®µę—¶å®žé™…ē”Ÿę•ˆļ¼‰ + } + } +} +``` + +**第 3 ę­„ļ¼šč·‘äø¤ę¬” + č§£ęž `result.json` č¾“å‡ŗå¤šē»“åŗ¦åÆ¹ęÆ”** + +```bash +python run_baseline.py # 产出 runs/baseline_/result.json +python run_advanced.py # 产出 runs/advanced_/result.json +python compare.py # č§£ęžäø¤ä»½ result.jsonļ¼Œč¾“å‡ŗåÆ¹ęÆ”č”Ø +``` + +`compare.py` åŗ”å…³ę³Øēš„ē»“åŗ¦ļ¼š + +| 结度 | å­—ę®µļ¼ˆ`result.json` äø­ęŒ‰ camelCase 瓢引) | 解读 | +| --- | --- | --- | +| ęœ€ē»ˆč“Øé‡ | `bestPassRate` / `baselinePassRate` | ē«Æåˆ°ē«Æęåˆ†ļ¼›å¤šę•°ä»»åŠ”äøŠäø¤å„—ē­–ē•„ę”¶ę•›ęŽ„čæ‘ | +| ęŽ¢ē“¢ę·±åŗ¦ | `totalRounds` / `roundsAccepted` | ęŽ„å—ēŽ‡ļ¼ˆ`roundsAccepted / totalRounds`ļ¼‰åę˜  frontier ęŽ„å—é—Øę§› | +| merge 蔌为 | 
`mergeRoundsTotal` / `rounds[*].kind` | 验证 `use_merge=true` ę˜Æå¦ēœŸēš„č§¦å‘ merge | +| åę€é¢„ē®— | `metricCallsTotal` / `proposalsTotal` | `skip_perfect_score=true` åœØå¤§č®­ē»ƒé›† + é«˜åŸŗēŗæčµ·ē‚¹ę—¶čŠ‚ēœę›“ę˜Žę˜¾ | +| `stop_reason` | `stopReason` | å“Ŗę” stopper ęŠ¢é—øļ¼›äø¤å„— advanced/baseline ēš„ stop_reason äøåŒę—¶äøåÆē›“ęŽ„åÆ¹ęÆ” | + +> **čø©å‘ęé†’**:`result.json` äø­å­—ę®µę˜Æ camelCase(`bestPassRate` 而非 `best_pass_rate`)。SDK å†…éƒØē”Ø snake_caseļ¼Œåŗåˆ—åŒ–ę—¶é€ščæ‡ pydantic alias č‡ŖåŠØč½¬ camelCase。读 `result.json` ę—¶ęŒ‰ camelCase 瓢引。 + +**å‡ äøŖé«˜é˜¶å¼€å…³ēš„é¢„ęœŸč”ØēŽ°**ļ¼ˆäøšåŠ”ä»»åŠ”äøŠęœŖåæ…éƒ½ęˆē«‹ā€”ā€”ä»„ä½ č‡Ŗå·±ēš„å®žęµ‹äøŗå‡†ļ¼‰ļ¼š + +| 开关 | ęœŸęœ›ę”¶ē›Š | é€‚ē”Øå‰ę | +| --- | --- | --- | +| `frontier_type="objective"`(vs `"instance"`) | ęŽ„å—ēŽ‡ę›“é«˜ / ęŽ¢ē“¢ę›“ęæ€čæ› | 多 metric åœŗę™Æļ¼›å°č®­ē»ƒé›†ļ¼ˆ< 10 caseļ¼‰äø‹åÆčƒ½čæ‡ę‹Ÿåˆ train minibatch 导臓 valset éœ‡č” | +| `frontier_type="hybrid"` | 多 metric é—“äøäŗ’ē›øč¦†ē›– | 多 metric ēœŸå†²ēŖåœŗę™Æļ¼ˆå‚č§ §4.5) | +| `skip_perfect_score=true` | čŠ‚ēœ reflection LM č°ƒē”Ø | å¤§č§„ęØ”č®­ē»ƒé›† + 高 baseline čµ·ē‚¹ļ¼›å°ę•°ę®é›†äø‹ę»”åˆ† case ęžå°‘ļ¼ŒčŠ‚ēœęœ‰é™ | +| `use_merge=true` | č·Øå­—ę®µčžåˆå€™é€‰ | **ä»…å¤šå­—ę®µļ¼ˆ`add_path` ≄ 2ļ¼‰ę‰ä¼šēœŸå®žč§¦å‘**ļ¼›å•å­—ę®µé…ē½®ę°øčæœ 0 merge round(`mergeRoundsTotal=0` ę˜Æé¢„ęœŸļ¼Œå‚č§ §4.3) | + +**ęŽ„å…„å‰č‡Ŗę£€č”Ø**: + +| ę£€ęŸ„é”¹ | äøé€ščæ‡ēš„åŽęžœ | +| --- | --- | +| äø¤å„—é…ē½®ę˜Æå¦ä»…å·®**č¦éŖŒčÆēš„å‡ äøŖå¼€å…³**ć€å…¶ä½™å…ØéƒØē›øåŒ | 否 → åÆ¹ęÆ”ē»“ęžœå«ę··ę‚å˜é‡ļ¼Œē»“č®ŗäøåÆäæ” | +| `seed` ę˜Æå¦äø¤å„—äø€č‡“ | 否 → å·®å¼‚åÆčƒ½ę„č‡Ŗéšęœŗę€§č€Œéžé…ē½®ē­–ē•„ | +| `max_metric_calls` ę˜Æå¦äø¤å„—äø€č‡“ | 否 → äø€å„—ęœ‰ę›“å¤šé¢„ē®—č‡Ŗē„¶åˆ†ę•°ę›“é«˜ļ¼Œäøčƒ½å½’å› åˆ°ē­–ē•„ | +| ę˜Æå¦åŒę—¶å…³ę³Ø**å¤šē»“åŗ¦åÆ¹ęÆ”**č€Œéžå•äø€ `bestPassRate` | 否 → å¤šę•°ä»»åŠ”äø¤å„—ęœ€ē»ˆåˆ†ę•°ęŽ„čæ‘ļ¼Œēœ‹äøå‡ŗå·®å¼‚ļ¼›å·®å¼‚č—åœØåˆ°č¾¾č·Æå¾„ | +| `use_merge` / `skip_perfect_score` ē­‰å¼€å…³ę˜Æå¦åœØä½ 
的任务结构下有意义 | 单字段任务开 `use_merge` 永远 0 触发(无害但无收益);高基线任务开 `skip_perfect_score` 节省可观 | + +> 高阶配置**不是越复杂越好**。许多任务上 baseline 配置已能达到合理收敛,advanced 只在特定任务结构(多目标、多字段、大规模训练集等)下显示价值。**用数据决定,不用直觉**。 + +**→ 完整 example**:[`examples/optimization/advanced_strategies/`](../../../examples/optimization/advanced_strategies/) +- `optimizer_baseline.json` / `optimizer_advanced.json` — A/B 对照的两套配置(仅差 3 个开关) +- `run_baseline.py` / `run_advanced.py` — 两个独立入口(保持其余变量一致) +- `compare.py` — 解析两次 `result.json` 输出多维度对比表的标准模板 + + +## 5 GEPA 是怎么工作的 + +跑了一次优化、看着分数从 0.4 涨到 0.85,但你不知道**这一路框架到底干了什么**——它读了哪些数据?反思 LM 看到了什么?凭什么决定保留还是丢弃一个候选?SLO 触发时是立刻停还是等当前轮跑完? + +> **GEPA** = Genetic-Pareto,是一个基于**反思**(reflection)的进化搜索算法([gepa-ai/gepa](https://github.com/gepa-ai/gepa),MIT License)。本框架通过 `OPTIMIZER_REGISTRY` 把 `gepa.optimize()` 包成 `GepaReflectiveOptimizer` 接入,并补一层 SDK 适配(评估桥接、反思反馈构造、停机判定、原子落盘等)。 + +### 5.1 一轮优化里到底跑了什么 + +**先记住三个角色**——后面所有图和表都围绕这三个: + +| 角色 | 是谁 | 干什么 | +| --- | --- | --- | +| **agent** | 你的业务 agent(通过 `call_agent` 接入) | 接一条 query 输出一条答复 | +| **judge / metric** | `evaluate.metrics` 配置的评测器 | 给 agent 答复打分(0~1) | +| **反思 LM** | `algorithm.reflection_lm` 配置的 LLM | 看失败 case 反馈 → 生成新的 prompt 候选 | + +**第 0 轮**:用 baseline prompt 跑 valset → 得到 
baseline åˆ†ę•°ļ¼ˆä½ ēš„"起点线") + +**ä¹‹åŽęÆäø€č½®ļ¼ˆreflective round)**ęŒ‰čæ™ 5 歄走: + +```text + +----------------------------+ + | Previous round's prompt | + +--------------+-------------+ + | + v + (1) 抽 minibatch -> 从 trainset éšęœŗęŠ½ N ę” case + (N = reflection_minibatch_size) + | + v + (2) 跑一欔评估 -> ęŠŠå€™é€‰å†™åˆ° prompt ꖇ件 + -> 调 call_agent č·‘čæ™ N ę” case + -> metric ę‰“åˆ†ļ¼Œå¾—åˆ°å¤±č“„ę”ˆä¾‹ + | + v + (3) åę€ LM ē”Ÿęˆę–°å€™é€‰ -> 把失蓄 case åé¦ˆå–‚ē»™åę€ LM + -> å®ƒč¾“å‡ŗę–°ēš„ prompt ę–‡ęœ¬ + | + v + (4) é‡čÆ„ + å…„ Pareto å‰ę²æ -> ę–°å€™é€‰åœØ minibatch äøŠé‡č·‘äø€ę¬” + -> ęÆ”åŽ†å²å€™é€‰å„½å°±å…„å‰ę²æļ¼Œå¦åˆ™äø¢å¼ƒ + | + v + (5) ę£€ęŸ„åœęœŗę”ä»¶ -> 6 äøŖ stopper ä»»äø€č§¦å‘ -> 停 + -> å¦åˆ™čæ›å…„äø‹äø€č½® +``` + +**å‡ ę”å…³é”®čÆ“ę˜Ž**: + +- **第 (2) ę­„ēš„"评估"** å®žé™…č·‘äŗ† `len(minibatch) Ɨ num_runs Ɨ len(metrics)` ꬔ LLM čÆ„ä¼°ļ¼ˆčÆ¦č§ §6.1) +- **第 (3) ę­„ēš„"åę€ LM ēœ‹åˆ°ä»€ä¹ˆ"** å†³å®šę”¹å†™č“Øé‡ā€”ā€”čæ™ę˜Æäø‹äø€čŠ‚ §5.2 ēš„å†…å®¹ +- **第 (4) ę­„ēš„"Pareto å‰ę²æ"** ē®€å•čÆ“å°±ę˜Æ"äæē•™å„ę–¹é¢éƒ½äøč¢«č¶…č¶Šēš„å€™é€‰é›†";具体粒度由 `frontier_type` ęŽ§åˆ¶ļ¼ˆčÆ¦č§ §5.3) +- **第 (5) ę­„ēš„"ä»»äø€č§¦å‘å³åœ"** ęœ‰äøŖē»†čŠ‚ļ¼šč§¦å‘åŽ**ē­‰å½“å‰č½®č·‘å®Œę‰ēœŸę­£åœ**ļ¼Œäøę˜Æē«‹å³ kill(详见 §5.4) +- **valset 评估**ē©æę’åœØäø­é—“å‡ č½®é‡Œå‘ē”Ÿļ¼ˆgepa å†…éƒØå†³å®šä½•ę—¶č·‘ļ¼‰ļ¼Œē”ØäŗŽč®”ē®—"å½“å‰ęœ€ä¼˜å€™é€‰åœØ valset äøŠēš„ēœŸå®žåˆ†ę•°",也是 `score_threshold` / `required_metrics` ē­‰ stopper ēš„åˆ¤ę–­ä¾ę® + +**ē‰¹ę®Šęƒ…å†µļ¼šmerge round** + +`use_merge=true` ę—¶ļ¼ŒęÆéš”č‹„å¹² reflective round ä¼šę’å…„äø€č½® **merge round**ļ¼šä»Ž Pareto å‰ę²æęŒ‘äø¤äøŖå€™é€‰čžåˆęˆäø€äøŖę–°å€™é€‰ļ¼ˆ"取 A åœØå­—ę®µ X äøŠēš„å†™ę³• + B åœØå­—ę®µ Y äøŠēš„å†™ę³•")。**ä»…åœØå¤šå­—ę®µåœŗę™Æäø‹ęœ‰ę„ä¹‰**ā€”ā€”å•å­—ę®µę—¶ę°øčæœäøč§¦å‘ļ¼Œ`mergeRoundsTotal=0` ę˜Æé¢„ęœŸć€‚čÆ¦č§ §4.3怂 + +### 5.2 åę€ LM å®žé™…ēœ‹åˆ°ä»€ä¹ˆ + +åę€ LM 改写 prompt ēš„č“Øé‡ļ¼Œ**å®Œå…Øå–å†³äŗŽå®ƒčƒ½ēœ‹åˆ°å¤šäø°åÆŒēš„å¤±č“„åé¦ˆ**ć€‚å¦‚ęžœåŖå‘ŠčÆ‰å®ƒ"case_3 
å¤±č“„äŗ†ļ¼Œåˆ†ę•° 0.3"ļ¼Œå®ƒåŖčƒ½ēžŽēŒœļ¼›å¦‚ęžœå‘ŠčÆ‰å®ƒ"case_3 第 2 turn ę—¶ agent 应输出 `{"city":"上海"}` ä½†å®žé™…č¾“å‡ŗ `Shanghai`ļ¼Œč§„åˆ™č¦ę±‚ case-sensitive ē²¾ē”®åŒ¹é…"ļ¼Œå®ƒå°±čƒ½é’ˆåÆ¹ę€§ę”¹ prompt怂 + +`_AgentGEPAAdapter.make_reflective_dataset` äøŗęÆę”**å¤±č“„ēš„ case** ęø²ęŸ“äø€ä»½ markdown č®°å½•ļ¼Œå–‚ē»™åę€ LMć€‚ęÆę”č®°å½•å­—ę®µļ¼š + +| 字段 | äø€å„čÆč§£é‡Š | ä½•ę—¶å‡ŗēŽ° | +| --- | --- | --- | +| `case_id` | case ēš„ēØ³å®š IDļ¼ˆē”ØäŗŽåę€ LM č·Øę”å¼•ē”Øļ¼‰ | ę€»ę˜Æ | +| `score` | čæ™ę” case ēš„čšåˆåˆ†ę•°ļ¼ˆ0~1,1.0 = å…Ø metric é€ščæ‡ļ¼‰ | ę€»ę˜Æ | +| `Case Body` | å¤±č“„ēŽ°åœŗēš„ markdownļ¼šęÆäøŖ turn äø€ę®µļ¼Œé‡Œé¢ęœ‰ē”Øęˆ·č¾“å…„ć€ęœŸęœ›ē­”å¤ć€agent å®žé™…ē­”å¤ć€tool č°ƒē”Øč½Øčæ¹ć€ęÆę” metric ēš„åˆ¤å®šļ¼ˆPASS/FAIL + åˆ†ę•° + å¤±č“„åŽŸå› ļ¼‰ | ę€»ę˜Æ | +| `Other Active Components` | å½“å‰č½®**äøč¢«ę”¹å†™**ēš„å…¶ä»– prompt å­—ę®µé•æä»€ä¹ˆę · | å¤šå­—ę®µä¼˜åŒ–ę—¶ā€”ā€”č®©åę€ LM åœØę”¹ A ę—¶ēœ‹åˆ° B/C ēŽ°ēŠ¶ļ¼Œéæå…ę”¹åäøŠäø‹ęøøå…¼å®¹ę€§ | +| `history_top_k` | čæ™ę” case åŽ†å²äøŠč·‘å¾—ęœ€å„½ēš„å‡ ę¬” agent ē­”å¤ļ¼ˆęŒ‰åˆ†ę•°ęŽ’ļ¼‰ | `reflection_history_top_k > 0` ę—¶ | + +**`Case Body` ēš„å…·ä½“ē»“ęž„**: + +```text +### Turn 1 +**User**: <ē”Øęˆ·åŽŸå§‹č¾“å…„> +**Expected**: <ęœŸęœ›ē­”å¤> +**Agent Response**: +**Tool Trace**: ← ä»…ęœ‰ tool č°ƒē”Øę—¶ + - tool_name(args) → response +**Verdict** (Turn 1): + [FAIL] metric_name: score=0.0000, threshold=1.0000 + reason: agent output not byte-equal to expected (case-sensitive) + Ā· rubric[no_emoji]: PASS score=1.00 ← 仅 LLM rubric metric + +### Turn 2 +... + +### Overall (case-level aggregate) ← 多 turn ęˆ–å¤š run ę—¶ +... 
+``` + +**åÆ¹ē”®å®šę€§ metric ēš„å¤±č“„åŽŸå› åˆęˆ**ļ¼šå½“ metric 是 `final_response_avg_score` čæ™ē±»äøåø¦ LLM judge ēš„čÆ„ęµ‹å™Øć€åŖč¾“å‡ŗ score+status ę—¶ļ¼Œę”†ęž¶ä¼š**č‡ŖåŠØåˆęˆäø€å„å¤±č“„čÆ“ę˜Ž**ļ¼ˆä¾‹å¦‚ļ¼š`agent output not byte-equal to expected (case-sensitive)` / `expected substring not contained in agent output (case-insensitive)` / `JSON structural comparison failed`ļ¼‰ļ¼Œč®©åę€ LM ē›“ęŽ„ēœ‹åˆ°**äøŗä»€ä¹ˆę²” match**ļ¼Œč€Œäøåæ…č‡Ŗå·± diff ę–‡ęœ¬åŽ»ēŒœć€‚ + +> ęƒ³ēœ‹åę€ LM å®žé™…ę‹æåˆ°ēš„ prompt å…Øč²Œļ¼Ÿč·‘ä¼˜åŒ–ę—¶ęŠŠ `verbose=2` ę‰“å¼€ļ¼Œgepa å†…éƒØę—„åæ—ä¼šé™„åø¦ęÆč½®ēš„åę€ prompt ę–‡ęœ¬ā€”ā€”čÆ»äø€ę¬”åæƒé‡Œå°±ęœ‰ę•°äŗ†ć€‚ + +### 5.3 5 äøŖę øåæƒē®—å­ēš„å®žé™…č”Œäøŗ + +`optimizer.json` ēš„ `optimize.algorithm` ę®µé‡Œļ¼Œęœ€åøøč¢«é—®åˆ°ēš„ 5 äøŖå¼€å…³ļ¼ŒåœØęŗē é‡Œåˆ°åŗ•å¹²ä»€ä¹ˆļ¼š + +| 算子 | äø€å„čÆåŠŸčƒ½ | č°ƒå®ƒēš„å…øåž‹åŠØęœŗ | čÆ¦ē»†å‚č€ƒ | +| --- | --- | --- | --- | +| `reflection_minibatch_size` | ęÆč½®åę€ LM ēœ‹å‡ ę” case | č°ƒå°ēœ tokenļ¼Œč°ƒå¤§č®©åę€ LM č§†é‡Žę›“å…Ø | [§7.3.3](#733-optimizealgorithm-ꮵ) | +| `module_selector` | å¤šå­—ę®µę—¶čæ™äø€č½®ę”¹å“ŖäøŖå­—ę®µļ¼ˆ`round_robin` č½®ę¢ / `all` 全选 / `random` éšęœŗļ¼‰ | ęƒ³ęø…ę™°å½’å› ęÆäøŖå­—ę®µč“”ēŒ® → `round_robin` | [§4.3](#43) | +| `frontier_type` | Pareto å‰ę²æē²’åŗ¦ļ¼ˆ`instance` ęÆ case 一个 best / `objective` ęÆ metric 一个 / `hybrid` 双层 / `cartesian` ē¬›å”å°”ē§Æļ¼‰ | 多 metric ēœŸå†²ēŖę—¶ → `hybrid` | [§4.5](#45) | +| `candidate_selection_strategy` | äø‹äø€č½®åę€ēš„ parent ę€Žä¹ˆęŒ‘ļ¼ˆ`pareto` é»˜č®¤ä»Žå‰ę²æęŒ‘ / `current_best` ē”Øå½“å‰ęœ€ä¼˜ / 等) | ęƒ³åŠ åæ«ę”¶ę•›ęˆ–åŠ å¤§ęŽ¢ē“¢ | [§7.3.3](#733-optimizealgorithm-ꮵ) | +| `use_merge` + `max_merge_invocations` | ę˜Æå¦åÆē”Øč·Øå­—ę®µčžåˆ + č§¦å‘ę¬”ę•°äøŠé™ | **ä»…å¤šå­—ę®µę‰ēœŸč§¦å‘**ā€”ā€”å•å­—ę®µäø‹ `mergeRoundsTotal=0` ę˜Æé¢„ęœŸ | [§4.3](#43) / [§4.8](#48) | + +### 5.4 åœęœŗę—¶ęœŗļ¼šå®Œęˆå½“å‰č½®å†åœ + +6 ē§ algorithm ēŗ§åœęœŗę”ä»¶ļ¼ˆ`max_metric_calls` / `timeout_seconds` / 
`max_iterations_without_improvement` / `score_threshold` / `max_candidate_proposals` / `max_tracked_candidates`)在每轮结束时**同步检查**——任一条件满足即停。 + +**3 个容易踩的细节**: + +| 细节 | 含义 | 怎么避雷 | +| --- | --- | --- | +| **不立即 kill 当前轮** | 触发停机时不会把正在跑的 round 中断;要等当前 round 跑完才真正停 | SLO 硬截止场景下,`timeout_seconds` 设为业务真实窗口的 50% 左右,留缓冲 | +| **实际终止时间常超过 `timeout_seconds`** | 上一条的直接后果——卡在长 round 里时尤其明显 | 给 `call_agent` 内部的 LLM 调用加自己的超时(参考 §4.2 CLI 的 90s 超时) | +| **多个 stopper 同时触发的优先级** | `framework_stopper`(`required_metrics` 政策)优先;其次按 algorithm 级 stopper 的插入顺序取第一个 | `OptimizeResult.stop_reason` 字段记录抢闸者,跑完直接看就知道是哪条触发的 | + +**`stop_reason` 取值参考**(`OptimizeResult.stop_reason`): + +``` +required_metrics_passing ← framework 级(最高优先级) +score_threshold ← 达到目标分 +budget_exhausted ← max_metric_calls +timeout ← timeout_seconds +no_improvement ← max_iterations_without_improvement +max_candidate_proposals +max_tracked_candidates +user_requested_stop ← 用户 touch 了 optimize.stop 文件 +completed ← 没有 stopper 触发,gepa 自然跑完 +``` + +### 5.5 一种特殊情况:FAILED + +正常情况下 `OptimizeResult.status = "SUCCEEDED"`——gepa 跑完了循环(自然结束 / stopper 触发都算)。但有一种特殊状态值得用户关注: + +- **`status = "FAILED"`**:gepa 在跑的过程中抛了异常(最常见:训练/验证集加载失败、`gepa.optimize()` 内部异常、反思 LM 调用失败) +- **此时 `best_prompts` 强制设为 `baseline_prompts`**——保证你拿到的产物**永远不会比 baseline 差** +- **`update_source=True` 在 FAILED 时不会回写**源 prompt 文件(详见 §3.4 决策表) + 
+å¦äø€äøŖę˜“ę··ē‚¹ę˜Æ"č·‘å®Œäŗ†ä½†ę²”ę”¹å–„"ļ¼ščæ™ē§ęƒ…å†µ `status` ä»ę˜Æ `"SUCCEEDED"`ļ¼Œä½† `finish_reason="no_improvement"`ļ¼Œäø” `best_prompts == baseline_prompts`——summary.txt é‡Œä¼šę˜¾ē¤ŗ `baseline → baseline`ļ¼ˆę²”é€€åŒ–ä¹Ÿę²”ęå‡ļ¼‰ć€‚čæ™ę˜Æé¢„ęœŸļ¼Œäøę˜Æ bug怂 + + + +## 6 ęˆęœ¬äøŽå¹¶å‘ + +č·‘äø€ę¬”ä¼˜åŒ–č¦å¤šå°‘ LLM č°ƒē”Øļ¼Ÿå“Ŗäŗ›ę—‹é’®å½±å“č°ƒē”Øé‡ć€å“Ŗäŗ›å½±å“å¹¶å‘é‡ć€å“Ŗäŗ›äø¤č€…éƒ½å½±å“ļ¼Ÿ + +### 6.1 äø€ę¬”ä¼˜åŒ–ēš„ LLM č°ƒē”Øä»Žå“Ŗę„ + +LLM č°ƒē”Øåˆ†äø¤å—ā€”ā€”**čÆ„ä¼°ä¾§åƒē»å¤§éƒØåˆ†**ļ¼Œåę€ä¾§é›¶å¤“ļ¼š + +**čÆ„ä¼°ä¾§ļ¼ˆagent + judge)**ļ¼šč·‘čæ™äŗ›äŗ‹å„č°ƒäø€ę¬” LLM—— + +```text +跑一欔 baseline čÆ„ä¼°ļ¼š valset å…Øč·‘äø€é ← čµ·ē‚¹ļ¼Œ1 ꬔ +ęÆäøŖ reflective round: 抽 N ę” case č·‘äø€é + ę–°å€™é€‰é‡č·‘ ← 主要成本 +ē‰¹å®šēš„ reflective round: 在 valset äøŠé‡čÆ„å½“å‰ęœ€ä¼˜å€™é€‰ ← gepa å†³å®šä½•ę—¶č·‘ +``` + +ęÆę¬”"č·‘äø€é"å®žé™…č§¦å‘ēš„ LLM č°ƒē”Øę•° = **case ꕰ Ɨ ęÆę” case ēš„ agent č°ƒē”Øę•° Ɨ `num_runs` Ɨ ęÆę” metric ēš„ judge č°ƒē”Øę•°**ć€‚å…¶äø­ļ¼š + +| ä¹˜ę•° | ę„ęŗ | å…øåž‹å–å€¼ | +| --- | --- | --- | +| ęÆę” case ēš„ agent č°ƒē”Øę•° | evalset ę•°ę®ļ¼›å¤šč½® conversation ę—¶ęŒ‰ turn ę•°ē“ÆåŠ  | 单 turn = 1,多 turn = N | +| `evaluate.num_runs` | č®©ęÆę” case č·‘å‡ ę¬”å–å‡å€¼ę¶ˆé™¤ LLM 输出方差 | 1ļ¼ˆé»˜č®¤ļ¼Œēœļ¼‰/ 2~3ļ¼ˆęŽØčļ¼ŒēØ³ļ¼‰ | +| ęÆę” metric ēš„ judge č°ƒē”Øę•° | ēœ‹ metric ē±»åž‹ļ¼š`final_response_avg_score` ē±»ē”®å®šę€§åŒ¹é… = 0 欔;`llm_judge` / `llm_rubric_response` ≄ 1 欔(`judge_models` ę•°ē»„é‡Œå‡ äøŖå°±ę˜Æå‡ ę¬”ļ¼‰ | 0~3 | + +**åę€ä¾§ļ¼ˆreflection LM)**: + +```text +ęÆäøŖ reflective round: 1 ę¬”ļ¼ˆē”Ÿęˆę–°å€™é€‰ prompt) +ęÆäøŖ merge round: 1 ę¬”ļ¼ˆä»… use_merge=true äø”å¤šå­—ę®µę—¶ę‰ęœ‰ merge round) +``` + +åę€ä¾§č°ƒē”Øę•°čæœå°‘äŗŽčÆ„ä¼°ä¾§ā€”ā€”é€šåøøäø€ę¬”å®Œę•“ä¼˜åŒ–åę€ LM 也就 5~20 ꬔ怂 + +### 6.2 č·‘å®ŒåŽä»Ž result.json čÆ»åˆ°ä»€ä¹ˆ + +`OptimizeResult` é‡Œå®žé™…č®°å½•ēš„ē»Ÿč®”å­—ę®µļ¼ˆäŗ§ē‰© `result.json` 里 camelCase ē“¢å¼•ļ¼‰ļ¼š + +| 字段 | 含义 | +| --- | --- | +| 
`totalMetricCalls` | gepa ē“Æč®”ēš„ case-level 评估欔数 | +| `totalReflectionLmCalls` | åę€ LM ē“Æč®”č°ƒē”Øę¬”ę•°ļ¼ˆå«é‡čÆ•ļ¼‰ | +| `totalTokenUsage` | åę€ LM 瓯讔 token:`{prompt, completion, total}` | +| `durationSeconds` | ꀻ wall-clock 耗ꗶ | + +éœ€č¦ä¼°ē®—äøšåŠ”ä¾§ēš„å®žé™… USD ęˆęœ¬ę—¶ļ¼Œē”Ø `totalTokenUsage` Ɨ LLM åŽē«Æå•ä»·åē®—åę€ä¾§ļ¼›agent / judge 侧从 LLM åŽē«Æē”Øé‡č®°å½•äø­ę‹‰å–ļ¼ˆAPI ęŽ§åˆ¶å° / billing ęŠ„č”Øļ¼‰ć€‚ + +### 6.3 4 äøŖåøøē”Øę—‹é’®ēš„ä¹˜ę•°ę•ˆåŗ” + +ꌉ"åÆ¹ę€»č°ƒē”Øé‡ēš„å½±å“å€ēŽ‡"ä»Žå¤§åˆ°å°ęŽ’ā€”ā€”é‡åˆ°ä¼˜åŒ–č·‘ēˆ†é¢„ē®—ę—¶ļ¼Œå…ˆč°ƒäøŠé¢ēš„ļ¼š + +| ę—‹é’® | 乘多少 | č°ƒå°ēš„ä»£ä»· | 详细 | +| --- | --- | --- | --- | +| `algorithm.max_metric_calls` | **ę€»č°ƒē”Øé‡ēš„ē”¬äøŠé™**——gepa 瓯讔达到就停 | å¤Ŗå°ā†’ä¼˜åŒ–ē¬¬ 1 č½®å°±č¢«å®ƒåœļ¼›ēœ‹äøåˆ°ä»»ä½•ęåˆ† | [§4.7](#47) | +| `evaluate.num_runs` | **乘 N**ā€”ā€”ęÆę” case č·‘ N ę¬”å–å‡å€¼ | 1 ę—¶ LLM č¾“å‡ŗę–¹å·®ē›“ęŽ„čæ›å…„åˆ†ę•°ļ¼ˆåŒ prompt äø¤ę¬”č·‘åˆ†äøäø€ę ·ļ¼‰ļ¼›å»ŗč®® ≄ 2 | [§4.5](#45) | +| `optimize.eval_case_parallelism` | **äøå½±å“ę€»é‡**ļ¼ŒåŖå½±å“**å¢™é’Ÿę—¶é—“**和**ēž¬ę—¶ QPS** | č°ƒé«˜ēœę—¶é—“ä½†å®¹ę˜“ę’ž LLM åŽē«Æ rate limit | [§4.5](#45) | +| `algorithm.reflection_minibatch_size` | **ä¹˜å‡ ę”**ā€”ā€”ęÆč½®åę€ LM ēœ‹å‡ ę” caseļ¼›čÆ„ä¼°ä¾§ä¹ŸęŒ‰čæ™äøŖę•°ē®— | å¤Ŗå¤§ā†’åę€ prompt ę’‘ēˆ† LLM äøŠäø‹ę–‡ēŖ—å£ | [§4.3](#43) | + +### 6.4 ęƒ³åˆē†č®¾é˜ˆå€¼ļ¼Ÿå…ˆč·‘äø€ę¬”åŸŗå‡† + +设 `timeout_seconds` / `max_metric_calls` ē­‰é˜ˆå€¼å‰ļ¼Œ**å…ˆęŒ‰é»˜č®¤é…ē½®č·‘äø€ę¬”åŸŗå‡†**ā€”ā€”ä»Žäŗ§ē‰©é‡ŒčÆ»äø¤äøŖę•°ļ¼š + +| č¦ęµ‹ēš„å€¼ | ę€Žä¹ˆęµ‹ | ę€Žä¹ˆē”Ø | +| --- | --- | --- | +| **å•č½®å…øåž‹č€—ę—¶** | `runs//result.json` 里 `rounds[*].durationSeconds`ļ¼ˆå–äø­ä½ę•°ļ¼‰ | `timeout_seconds` č‡³å°‘č®¾äøŗå•č½®č€—ę—¶ Ɨ 2ļ¼Œå¦åˆ™ē¬¬ 1 č½®å°±ęŠ¢é—øēœ‹äøåˆ°ä¼˜åŒ–čæ›å±• | +| **å•č½® metric_calls** | 同上,`totalMetricCalls / totalRounds` | `max_metric_calls` č‡³å°‘čƒ½č·‘čæ‡ `max_iterations_without_improvement` č½®ļ¼Œå¦åˆ™ę°øčæœę˜Æ budget å…ˆęŠ¢é—ø | + 
+**例**ļ¼šåŸŗå‡†č·‘ę˜¾ē¤ŗå•č½® 30 ē§’ć€å•č½® 4 ꬔ metric_calls,CI ēŖ—å£ 5 åˆ†é’Ÿā€”ā€”é‚£ä¹ˆ `timeout_seconds=120`ļ¼ˆē•™ē¼“å†²ļ¼‰ć€`max_metric_calls=24`ļ¼ˆč·‘ 6 轮够 `max_iterations_without_improvement=3` ęŠ¢é—øļ¼‰ć€‚ + +### 6.5 å•č½®ēž¬ę—¶ LLM QPS ęŽ§åˆ¶ + +å•č½®å†…å¹¶å‘č·‘å‡ŗēš„ LLM čÆ·ę±‚ę•°ļ¼š + +```text +å•č½®ēž¬ę—¶ LLM QPS ā‰ˆ eval_case_parallelism (å¹¶č”Œč·‘å‡ ę” case) + Ɨ num_runs (ęÆę” case 跑几欔) + Ɨ (ęÆę” case ēš„ agent č°ƒē”Øę•° + ꉀ꜉ judge č°ƒē”Øę•°) +``` + +**å…øåž‹åœŗę™Æä¼°ē®—**:3 äøŖ judge + `num_runs=2` + `eval_case_parallelism=4` + ęÆ case 1 ꬔ agent č°ƒē”Ø + 3 ꬔ judge č°ƒē”Ø → å•č½®ēž¬ę—¶ēŗ¦ 32 ꬔ LLM 请求。当 LLM åŽē«Æ rate limit äøŗ 30 QPS ę—¶čÆ„é…ē½®åæ…ē„¶č§¦å‘é™ęµć€‚ + +**ęŽ§åˆ¶ēž¬ę—¶ QPS ēš„äø¤äøŖå‚ę•°**ļ¼ˆęŒ‰ę•ˆęžœęŽ’åŗļ¼‰ļ¼š + +| å‚ę•° | 影响 | 适用 | +| --- | --- | --- | +| `eval_case_parallelism` | ē›“ęŽ„é™ä½Žå¹¶å‘ case ꕰ | å¤§å¤šę•°ęƒ…å†µé¦–é€‰ļ¼›é»‘ē›’ CLI态multi-judge ē­‰å• case č°ƒē”ØåÆ†é›†ēš„åœŗę™Æäø‹č®¾äøŗ `1` äø²č”Œę‰§č”Œļ¼ˆčÆ¦č§ [§4.2](#42)态[§4.5](#45)) | +| `num_runs` | å‡å°‘ęÆę” case ēš„é‡å¤čÆ„ä¼° | ē‰ŗē‰²äø€å®šēš„ę–¹å·®ēØ³å®šę€§ļ¼›å»ŗč®®åœØē”®č®¤ LLM č¾“å‡ŗę–¹å·®č¾ƒå°åŽę‰äø‹č°ƒ | + +### 6.6 åę€ LM é€‰åž‹äøŽé…ē½® + +åę€ LM ēš„č¾“å‡ŗč“Øé‡ē›“ęŽ„å†³å®š prompt ę”¹å†™č“Øé‡ć€‚é…ē½®ä½ē½®ļ¼ˆ`optimizer.json`ļ¼‰ļ¼š + +```jsonc +{ + "optimize": { + "algorithm": { + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, // åę€ prompt č¾ƒé•æļ¼Œē•™å¤Ÿč¾“å‡ŗē©ŗé—“ + "temperature": 0.6 // 0.6~0.8 ä¹‹é—“ļ¼Œč®© LM ęœ‰åˆ›é€ ę€§ + } + } + } + } +} +``` + +**äø¤ę”å»ŗč®®**: + +- **åÆäøŽ agent / judge ē‹¬ē«‹é…ē½®**——`reflection_lm` ę®µę˜Æē‹¬ē«‹ēš„ļ¼Œbusiness åÆä»„é€‰äøåŒēš„ modelļ¼ˆéæå…"自评"åå·®ļ¼Œęˆ–č€…ēŗÆē²¹å› äøŗ reflection ä»»åŠ”åÆ¹ęØ”åž‹ęŽØē†åŠ›č¦ę±‚ę›“é«˜ļ¼‰ +- **token ē”Øé‡ēœŸå®žč®°å½•**——`totalTokenUsage` å­—ę®µä¼šē“Æč®”åę€ LM ēš„å®žé™… prompt + completion + total 
token ę•°ļ¼›ęŒ‰ LLM åŽē«Æå•ä»·åē®— USD å³åÆ + + + +## 7 å®Œę•“ API å‚č€ƒ + +å·„å…·ä¹¦ē« čŠ‚ļ¼ŒęŒ‰"ęƒ³ę‰¾ä»€ä¹ˆå‚ę•°"组织。**ęÆäøŖč”Øéƒ½ęœ‰"必唫"列**ļ¼Œäø‰ę”£å«ä¹‰ļ¼š + +- **必唫**ļ¼šäøä¼ /äøé… → 启动期 fail-fast ꊄ错 +- **选唫**ļ¼šåÆäøé…ļ¼›äøé…čµ°é»˜č®¤å€¼ +- **ę”ä»¶åæ…å”«**ļ¼šå•ēœ‹ę”ē›®åÆäøé…ļ¼Œä½†**ę»”č¶³ęŸę”ä»¶ę—¶åæ…é”»é…**ā€”ā€”ę”ä»¶å†™åœØę”ē›®ęœ«å°¾ēš„"ę”ä»¶"列 + +ę‰€ęœ‰å­—ę®µéƒ½åŸŗäŗŽå®žé™…ęŗē ļ¼ˆęÆäøŖč”Øå¤“ę ‡ę³Øęŗę–‡ä»¶č·Æå¾„ļ¼‰ć€‚ + +### 7.1 `AgentOptimizer.optimize` å‚ę•°č”Ø + +源码:`trpc_agent_sdk/evaluation/_agent_optimizer.py:AgentOptimizer.optimize`怂**11 äøŖ keyword-only å‚ę•°**——必锻用 `key=value` å½¢å¼ä¼ ļ¼ŒäøęŽ„å—ä½ē½®å‚ę•°ć€‚ + +| å‚ę•° | 必唫 | ē±»åž‹ | 默认 | čÆ“ę˜Ž | +| --- | --- | --- | --- | --- | +| `config_path` | **必唫** | `str` | — | optimizer.json é…ē½®ę–‡ä»¶č·Æå¾„ | +| `call_agent` | **必唫** | `async (str) -> str` | — | 业劔 agent é€‚é…å‡½ę•°ļ¼›ē­¾åå›ŗå®šäøŗ"ꎄ query čæ”å›ž str" | +| `target_prompt` | **必唫** | `TargetPrompt` | — | ę³Øå†Œå“Ŗäŗ› prompt å­—ę®µę˜Æä¼˜åŒ–ē›®ę ‡ļ¼ˆč‡³å°‘ 1 äøŖļ¼Œå¦åˆ™ęŠ„é”™ļ¼‰ | +| `train_dataset_path` | **必唫** | `str` | — | 训练 evalset 文件路径 | +| `validation_dataset_path` | **必唫** | `str` | — | 验证 evalset 文件路径;**åæ…é”»äøŽ `train_dataset_path` äøåŒ**ļ¼ˆé˜²ę•°ę®ę³„ę¼ļ¼Œę”†ęž¶ä¼šč§„čŒƒåŒ–č·Æå¾„å†ęÆ”åÆ¹ļ¼‰ | +| `output_dir` | **必唫** | `str` | — | äŗ§ē‰©ē›®å½•ļ¼›äøå­˜åœØč‡ŖåŠØåˆ›å»ŗ | +| `callbacks` | 选唫 | `Optional[Callbacks]` | `None` | čÆ„ęµ‹å™Øē”Ÿå‘½å‘ØęœŸå›žč°ƒļ¼ˆå°‘ē”Øļ¼‰ | +| `update_source` | 选唫 | `bool` | `False` | ä¼˜åŒ–ęˆåŠŸåŽę˜Æå¦å›žå†™ęŗ prompt ę–‡ä»¶ļ¼ˆå†³ē­–č”Øč§ [§3.4](#34-agentoptimizer)) | +| `verbose` | 选唫 | `int` | `1` | ē»ˆē«Æč¾“å‡ŗčÆ¦ē»†åŗ¦ļ¼š`0` 静默 / `1` 默认 Rich é¢ęæ / `2` 加 gepa å†…éƒØę—„åæ—č½¬å‘ | +| `extra_stop_callbacks` | 选唫 | `Optional[Sequence]` | `None` | čæč”Œę—¶čæ½åŠ ēš„ stopper(SLO ē›‘ęŽ§ / kill switch ē­‰ļ¼‰ļ¼›ę™®é€š callable 显示为 `stop_reason="completed"`ļ¼Œéœ€ēØ³å®šę ‡ē­¾ę—¶ē”Ø `_LabeledStopper` åŒ…č£…ęˆ–ęš“éœ² `.label` å±žę€§ | +| 
`extra_gepa_callbacks` | 选唫 | `Optional[Sequence]` | `None` | čæč”Œę—¶čæ½åŠ ēš„ gepa äŗ‹ä»¶ callbackļ¼ˆå¦‚č½¬å‘åˆ° dashboardļ¼‰ļ¼›éœ€å®žēŽ° `gepa.core.callback.GEPACallback` åč®® | + +**čæ”å›žå€¼**:`OptimizeResult`(详见 [§7.4](#74-optimizeresult--roundrecord-字段蔨))。 + +**启动期 fail-fast ę£€ęŸ„**(`_validate_inputs`ļ¼‰ļ¼š + +| ę£€ęŸ„äøé€ščæ‡ēš„ęƒ…å†µ | ęŠ›å‡ŗ | +| --- | --- | +| `output_dir` ę˜Æē©ŗå­—ē¬¦äø² | `ValueError` | +| `target_prompt` ę²”ę³Øå†Œä»»ä½•å­—ę®µ | `ValueError` | +| `call_agent` äøę˜Æ async å‡½ę•°ļ¼ˆå« `__wrapped__` ę£€ęŸ„ļ¼Œę”ÆęŒ `functools.partial` åŒ…č£…ēš„ async) | `TypeError` | +| `train_dataset_path` äøŽ `validation_dataset_path` č§£ęžåŽę˜ÆåŒäø€äøŖę–‡ä»¶ļ¼ˆē”Ø `os.path.normpath(os.path.abspath(...))` č§„čŒƒåŒ–åŽęÆ”åÆ¹ļ¼‰ | `ValueError`ļ¼ˆé˜²ę•°ę®ę³„ę¼ļ¼‰ | +| `evaluate.metrics` 含 `tool_trajectory_avg_score` ꈖ `llm_rubric_knowledge_recall`ā€”ā€”čæ™äæ©éœ€č¦ session traces / tool intermediate_data,`call_agent` é»‘ē›’ęØ”å¼ę‹æäøåˆ° | `ValueError` | +| é…ē½®äø­ `algorithm.name` äøåœØ `OPTIMIZER_REGISTRY` ę³Øå†Œčæ‡ | `ValueError`ļ¼ˆę¶ˆęÆåˆ—å‡ŗę‰€ęœ‰å·²ę³Øå†Œē®—ę³•åļ¼‰ | +| `use_merge=true` äø” `TargetPrompt` 字段数 < 2 | `UserWarning`ļ¼ˆäøč‡“å‘½ļ¼Œä½† `mergeRoundsTotal` ä¼šäø€ē›“ę˜Æ 0) | + +### 7.2 `TargetPrompt` API 蔨 + +源码:`trpc_agent_sdk/evaluation/_target_prompt.py`ć€‚äø€äøŖę³Øå†Œå¤šå­—ę®µ prompt ēš„å®¹å™Øļ¼Œę”ÆęŒę–‡ä»¶ęŗå’Œå›žč°ƒęŗäø¤ē§å½¢ę€ć€‚ + +| 方法 | ē­¾å | 蔌为 | +| --- | --- | --- | +| `add_path(name, path)` | `(str, str) -> Self` | ę³Øå†Œę–‡ä»¶ęŗå­—ę®µļ¼›`name` åæ…é”»å”Æäø€ļ¼›čæ”å›ž self ä¾›é“¾å¼č°ƒē”Ø | +| `add_callback(name, *, read, write)` | `(str, *, AsyncRead, AsyncWrite) -> Self` | ę³Øå†Œå›žč°ƒęŗå­—ę®µļ¼›`read: async () -> str`态`write: async (str) -> None` åæ…é”»éƒ½ę˜Æ asyncļ¼›`name` 必锻唯一 | +| `names()` | `() -> list[str]` | čæ”å›žå­—ę®µåļ¼ˆęŒ‰ę³Øå†Œé”ŗåŗļ¼‰ | +| `describe_source(name)` | `(str) -> str` | ę–‡ä»¶ęŗčæ”å›žč·Æå¾„ļ¼›å›žč°ƒęŗčæ”å›žå­—é¢é‡ `""`ļ¼›ęœŖēŸ„ name ꊛ `KeyError` | +| `read(name)` | 
`async (str) -> str` | čÆ»å–å•äøŖå­—ę®µ | +| `read_all()` | `async () -> dict[str, str]` | čÆ»å–å…ØéƒØå­—ę®µļ¼ˆęŒ‰ę³Øå†Œé”ŗåŗļ¼‰ | +| `write_all(prompts)` | `async (dict[str, str]) -> None` | **åŽŸå­å†™å…„å…ØéƒØå­—ę®µ**ļ¼ˆčÆ¦č§äø‹ę–¹å„‘ēŗ¦ļ¼‰ | + +**`write_all` ēš„åŽŸå­ę€§å„‘ēŗ¦**ļ¼ˆę„č‡Ŗęŗē ę³Øé‡Šļ¼‰ļ¼š + +1. **ę–‡ä»¶ęŗåŽŸå­å†™**ļ¼šå…ˆå†™åˆ° `.tmp`ļ¼Œå† `os.replace` é‡å‘½åļ¼ˆPOSIX äæčÆ rename åŽŸå­ļ¼‰ +2. **å¤±č“„å›žę»š**ļ¼šä»»äø€ę–‡ä»¶å†™å¤±č“„ę—¶ļ¼Œå·²å†™ęˆåŠŸēš„ę–‡ä»¶å›žę»šåˆ° pre-call 内容、清理残留 `.tmp`ļ¼ŒåŽŸå¼‚åøøę­£åøøäøŠęŠ› +3. **å›žę»šč‡Ŗčŗ«å¤±č“„**ļ¼šåŽŸå¼‚åøøé€ščæ‡ `__context__` äæē•™ļ¼Œå¹¶ęŠ› `_RollbackError` åˆ—å‡ŗęÆäøŖå­—ę®µēš„å›žę»šå¤±č“„ę˜Žē»†ā€”ā€”å›žę»šę˜Æ best-effortļ¼Œäø€äøŖå­—ę®µå¤±č“„äøä¼šč·³čæ‡åŽē»­ +4. **å›žč°ƒęŗäøå›žę»š**ļ¼šę–‡ä»¶ęŗå†™ęˆåŠŸåŽå†ä¾ę¬”č·‘å›žč°ƒęŗļ¼›å›žč°ƒęŗå¤±č“„ę—¶ļ¼Œę–‡ä»¶ęŗå›žę»š baselineļ¼Œä½†**å›žč°ƒęŗč‡Ŗčŗ«äøå›žę»š**ļ¼ˆå¹‚ē­‰ę€§ē”±č°ƒē”Øę–¹č“Ÿč“£ļ¼‰ + +**`write_all` ēš„ keys 栔验**ļ¼šä¼ å…„ `prompts` ēš„ key é›†åˆåæ…é”»**ē²¾ē”®ē­‰äŗŽ**ę³Øå†Œēš„å­—ę®µåé›†åˆļ¼Œå¦åˆ™ęŠ› `ValueError`怂 + +### 7.3 `optimizer.json` é…ē½®é”¹č”Ø + +源码:`trpc_agent_sdk/evaluation/_optimize_config.py`怂pydantic schema,**ę”ÆęŒ camelCase 和 snake_case äø¤ē§ key**ć€‚é”¶å±‚ē»“ęž„ļ¼š + +```jsonc +{ + "evaluate": { ... }, // čÆ„ęµ‹ę®µļ¼ˆäøŽ AgentEvaluator 同 schema) + "optimize": { // ä¼˜åŒ–å™Øę®µ + "eval_case_parallelism": 4, + "stop": { ... }, // ę”†ęž¶ēŗ§åœęœŗ + "algorithm": { ... 
} // ē®—ę³•å—ļ¼ˆå« reflection_lm) + } +} +``` + +#### 7.3.1 `evaluate` ꮵ + +源码:`_eval_config.py:EvalConfig`怂 + +| 字段 | 必唫 | ē±»åž‹ | 默认 | čÆ“ę˜Ž | +| --- | --- | --- | --- | --- | +| `metrics` | **ę”ä»¶åæ…å”«**ļ¼ˆč§äø‹ļ¼‰ | `Optional[list[dict]]` | `None` | metric ę•°ē»„ļ¼ŒęÆę”å« `metric_name` / `threshold` / `criterion`怂**é…äŗ† `metrics` 就忽畄 `criteria`** | +| `criteria` | **ę”ä»¶åæ…å”«**ļ¼ˆč§äø‹ļ¼‰ | `dict[str, Any]` | `{}` | ę—§å¼ē®€å†™ļ¼š`metric_name → threshold` ꈖ `{threshold, criterion}` | +| `num_runs` | 选唫 | `int` | `1` | ęÆę” case č·‘å‡ ę¬”å–å‡å€¼ļ¼ˆę¶ˆé™¤ LLM 输出方差);`≄ 2` ęŽØč | +| `user_simulator_config` | 选唫 | `Optional[Any]` | `None` | ē”Øęˆ·ęØ”ę‹Ÿå™Øé…ē½®ļ¼ˆå¤šč½®åœŗę™Æļ¼›å°‘ē”Øļ¼‰ | + +**ę”ä»¶**:`metrics` äøŽ `criteria` **č‡³å°‘é… 1 äøŖ**ā€”ā€”äø¤č€…éƒ½äøŗē©ŗę—¶ `evaluate.get_eval_metrics()` čæ”å›žē©ŗåˆ—č”Øļ¼ŒåÆåŠØęœŸä¼šå› ę²”ęœ‰ metric ęŠ„é”™ć€‚ę–°ęŽ„å…„ęŽØčē”Ø `metrics`ļ¼ˆę›“ē»“ęž„åŒ–ļ¼‰ļ¼Œ`criteria` äø»č¦äøŗå…¼å®¹ę—§é…ē½®äæē•™ć€‚ + +#### 7.3.2 `optimize` ꮵ + +源码:`_optimize_config.py:OptimizeConfig`怂 + +| 字段 | 必唫 | ē±»åž‹ | 默认 | čÆ“ę˜Ž | +| --- | --- | --- | --- | --- | +| `eval_case_parallelism` | 选唫 | `int` | `4` | åŒäø€č½®å†… case å¹¶å‘ę•°ļ¼ˆäøå½±å“ę€»č°ƒē”Øé‡ć€å½±å“ēž¬ę—¶ QPS) | +| `stop` | 选唫 | `FrameworkStopConfig` | `{required_metrics: "all"}` | ę”†ęž¶ēŗ§åœęœŗę®µļ¼ˆčÆ¦č§ [§7.3.5](#735-optimizestop-ꮵ)) | +| `algorithm` | **必唫** | `GepaReflectiveAlgo` | — | ē®—ę³•å—ļ¼ˆčÆ¦č§ [§7.3.3](#733-optimizealgorithm-ꮵ)) | + +#### 7.3.3 `optimize.algorithm` ꮵ + +源码:`_optimize_config.py:GepaReflectiveAlgo`怂`gepa_reflective` ē®—ę³•ēš„ę‰€ęœ‰åÆč°ƒå‚ę•°ć€‚ + +> **ē”¬ēŗ¦ęŸ**:蔨中**ęœ€åŽ 6 锹 stopper å­—ę®µäø­č‡³å°‘é… 1 äøŖ**ā€”ā€”å…ØéƒØē•™ē©ŗļ¼ˆé»˜č®¤ `None`ļ¼‰ä¼šč¢« `_require_at_least_one_stop_condition` ę‹’ē»ć€ęŠ› `ValueError` fail-fastć€‚čæ™å°±ę˜ÆęŠŠå®ƒä»¬ę ‡äøŗ"ę”ä»¶åæ…å”«"ēš„åŽŸå› ć€‚ + +**åŸŗē”€å­—ę®µ**: + +| 字段 | 必唫 | ē±»åž‹ | 默认 | čÆ“ę˜Ž | +| --- | --- | --- | --- | --- | +| `name` | **必唫** | 
`Literal["gepa_reflective"]` | — | ē®—ę³•é€‰ę‹©å™Øļ¼›ē›®å‰å”Æäø€åÆé€‰å€¼ | +| `reflection_lm` | **必唫** | `OptimizeModelOptions` | — | åę€ LM é…ē½®ļ¼ˆčÆ¦č§ [§7.3.4](#734-optimizealgorithmreflection_lm-ꮵ)) | +| `seed` | 选唫 | `int` | `42` | éšęœŗē§å­ļ¼›A/B å®žéŖŒę—¶äø¤å„—é…ē½®åŗ”äæęŒäø€č‡“ | + +**ęœē“¢č”Œäøŗå­—ę®µ**: + +| 字段 | 必唫 | ē±»åž‹ | 默认 | å–å€¼äøŽčÆ“ę˜Ž | +| --- | --- | --- | --- | --- | +| `candidate_selection_strategy` | 选唫 | Literal | `"pareto"` | `pareto` ä»Žå‰ę²æęŒ‘ļ¼ˆé»˜č®¤ęŽØčļ¼‰/ `current_best` ē”Øå½“å‰ęœ€ä¼˜ / `epsilon_greedy` ęŽ¢ē“¢-åˆ©ē”Ø / `top_k_pareto` å‰ę²æå‰ K åäø­éšęœŗ | +| `module_selector` | 选唫 | `str` | `"round_robin"` | å¤šå­—ę®µę—¶ęœ¬č½®ę”¹å“ŖäøŖļ¼š`round_robin` ęŒ‰ę³Øå†Œé”ŗåŗč½®ę¢ / `all` 全选 / `random` éšęœŗ | +| `frontier_type` | 选唫 | Literal | `"instance"` | Pareto å‰ę²æē²’åŗ¦ļ¼š`instance` ęÆ case 一个 best / `objective` ęÆ metric 一个 / `hybrid` 双层 / `cartesian` ē¬›å”å°”ē§Æ | +| `reflection_minibatch_size` | 选唫 | `Optional[int]` | `None` | ęÆč½®åę€ minibatch å¤§å°ļ¼›`None` 让 gepa 决定 | +| `reflection_history_top_k` | 选唫 | `int` (0~5) | `2` | ęÆę” case ē»™åę€ LM å¤šå°‘ę”åŽ†å²ęœ€ä½³ responseļ¼›0 ē¦ē”Øļ¼ŒäøŠé™ 5 | +| `perfect_score` | 选唫 | `float` | `1.0` | "å®Œē¾Žåˆ†"é˜ˆå€¼ļ¼ˆę­é… `skip_perfect_score`) | +| `skip_perfect_score` | 选唫 | `bool` | `True` | åę€ę—¶č·³čæ‡å·²ę»”åˆ†ēš„ case | + +**å¤šå­—ę®µčžåˆļ¼ˆmerge)字段**: + +| 字段 | 必唫 | ē±»åž‹ | 默认 | čÆ“ę˜Ž | +| --- | --- | --- | --- | --- | +| `use_merge` | 选唫 | `bool` | `False` | 启用 merge roundļ¼›**ä»…å¤šå­—ę®µļ¼ˆā‰„2ļ¼‰ę—¶ēœŸč§¦å‘**ļ¼Œå•å­—ę®µę—¶äøä¼šč§¦å‘ä¹Ÿäøä¼šęŠ„é”™ļ¼ˆä»… UserWarning) | +| `max_merge_invocations` | 选唫 | `int` | `5` | merge č§¦å‘ę¬”ę•°äøŠé™ | +| `merge_val_overlap_floor` | 选唫 | `int` | `5` | č§¦å‘ merge ēš„ęœ€ä½Ž val 集 case é‡å ę•° | + +**ę€§čƒ½å­—ę®µ**: + +| 字段 | 必唫 | ē±»åž‹ | 默认 | čÆ“ę˜Ž | +| --- | --- | --- | --- | --- | +| `cache_evaluation` | 选唫 | `bool` | `False` | ē¼“å­˜ (candidate, case) 
čÆ„åˆ†ļ¼›é‡å¤čÆ„ä¼°ē›“ęŽ„č·³ | +| `track_best_outputs` | 选唫 | `bool` | `False` | čæ½čøŖęÆ case ēš„ęœ€ä½³č¾“å‡ŗ | + +**åœęœŗę”ä»¶ 6 锹**——**č‡³å°‘é… 1 äøŖ**(OR čÆ­ä¹‰č§¦å‘ļ¼‰ļ¼š + +| 字段 | 必唫 | ē±»åž‹ | 默认 | ęŠ¢é—øę”ä»¶ | +| --- | --- | --- | --- | --- | +| `max_metric_calls` | ę”ä»¶åæ…å”« | `Optional[int]` | `None` | 瓯讔 case-level 评估欔数 ≄ N → 停 | +| `max_iterations_without_improvement` | ę”ä»¶åæ…å”« | `Optional[int]` | `None` | čæžē»­ N č½® best valset ę— ęå‡ → 停 | +| `timeout_seconds` | ę”ä»¶åæ…å”« | `Optional[float]` | `None` | wall-clock 超过 N ē§’ → 停 | +| `score_threshold` | ę”ä»¶åæ…å”« | `Optional[float]` | `None` | best valset åˆ†ę•° ≄ N → 停 | +| `max_candidate_proposals` | ę”ä»¶åæ…å”« | `Optional[int]` | `None` | å€™é€‰ęč®®ę¬”ę•° ≄ N → 停 | +| `max_tracked_candidates` | ę”ä»¶åæ…å”« | `Optional[int]` | `None` | Pareto å€™é€‰ę± å¤§å° ≄ N → 停 | + +**ę”ä»¶**:6 é”¹é‡Œč‡³å°‘é… 1 äøŖéž `None`ļ¼Œå¦åˆ™åÆåŠØęœŸ fail-fast。详见 [§4.7 SLO ē”¬ēŗ¦ęŸ](#47)怂 + +#### 7.3.4 `optimize.algorithm.reflection_lm` ꮵ + +源码:`_optimize_model_options.py:OptimizeModelOptions`ć€‚åę€ LM ēš„čæžęŽ„é…ē½®ć€‚ + +> **ę—„åøøåŖéœ€é… 4 äøŖ**:`model_name` / `base_url` / `api_key` / `generation_config`ļ¼ˆå…¶ä½™ē•™é»˜č®¤ļ¼‰ć€‚äø‹č”Øäø­ę ‡"高阶"ēš„ 6 é”¹äø€čˆ¬äøéœ€č¦ē¢°ć€‚ + +| 字段 | 必唫 | ē±»åž‹ | 默认 | čÆ“ę˜Ž | +| --- | --- | --- | --- | --- | +| `model_name` | **必唫** | `str` | `""` | ęØ”åž‹åļ¼ˆå¦‚ `"gpt-4o-mini"`ļ¼‰ļ¼›ē©ŗå­—ē¬¦äø²ē­‰äŗŽę²”é…ļ¼Œä¼šåÆåŠØęœŸęŠ„é”™ | +| `base_url` | 选唫 | `Optional[str]` | `None` | č‡Ŗå®šä¹‰ endpoint URL | +| `api_key` | 选唫 | `str` | `""` | API keyļ¼ˆå¤šę•° provider åæ…é”»ē»™ļ¼Œå¦åˆ™č°ƒē”Øé˜¶ę®µä¼šęŠ„é”™ļ¼‰ | +| `generation_config` | 选唫 | `Optional[dict]` | `None` | ē”Ÿęˆå‚ę•°ļ¼›å…øåž‹ļ¼š`{"max_tokens": 4096, "temperature": 0.6}` | +| `provider_name` | 高阶 | `str` | `""` | provider åļ¼›ē©ŗ / `"openai"` čµ° `OpenAIModel`ļ¼Œå…¶ä»–å€¼čµ° `ModelRegistry.create_model("{provider}/{model}")` | +| `variant` | 高阶 | `str` | `""` | OpenAI 
å…¼å®¹å˜ä½“ļ¼ˆä»… provider 是 openai 时) | +| `extra_fields` | 高阶 | `Optional[dict]` | `None` | é€ä¼ ē»™åŗ•å±‚ model ēš„é¢å¤–å­—ę®µ | +| `num_samples` | 高阶 | `Optional[int]` | `None` | 采꠷ꕰ | +| `weight` | 高阶 | `float` | `1.0` | ęƒé‡ļ¼ˆmulti-judge åœŗę™Æļ¼‰ | +| `think` | 高阶 | `Optional[bool]` | `None` | ę˜Æå¦åÆē”Ø thinking ęØ”å¼ | + +**å­—ę®µå€¼ę”ÆęŒēŽÆå¢ƒå˜é‡å±•å¼€**——`"${TRPC_AGENT_API_KEY}"` ä¼šč¢«č‡ŖåŠØę›æę¢ć€‚ + +#### 7.3.5 `optimize.stop` ꮵ + +源码:`_optimize_config.py:FrameworkStopConfig`怂 + +| 字段 | 必唫 | ē±»åž‹ | 默认 | 取值 | +| --- | --- | --- | --- | --- | +| `required_metrics` | 选唫 | `Optional[Union[Literal["all"], list[str]]]` | `"all"` | `"all"`ļ¼šę‰€ęœ‰ metric 都要达 thresholdļ¼›`["m1", "m2"]`ļ¼šåˆ—å‡ŗēš„ metric 都要达 thresholdļ¼ˆå…¶ä»– metric ä»å‚äøŽčÆ„ęµ‹ä½†äøå½±å“ę—©åœļ¼‰ļ¼›`null` ꈖ `[]`ļ¼šē¦ē”Øę”†ęž¶ēŗ§ę—©åœļ¼ˆä»…é  algorithm ēŗ§ stopper) | + +**åˆ—č”Øå½¢å¼ę ”éŖŒ**ļ¼šåˆ—č”Øäø­ēš„ metric ååæ…é”»čƒ½åœØ `evaluate.metrics[]` äø­ę‰¾åˆ°ļ¼Œå¦åˆ™åÆåŠØęœŸ `OptimizeConfigFile._validate_required_metrics_against_evaluate` ꊛ `ValueError`ļ¼Œé”™čÆÆäæ”ęÆåˆ—å‡ŗ"未矄 metric"和"åÆē”Ø metric"ęø…å•ć€‚ + +### 7.4 `OptimizeResult` + `RoundRecord` 字段蔨 + +源码:`trpc_agent_sdk/evaluation/_optimize_result.py`ć€‚čæ™ę˜Æ `optimize()` ēš„čæ”å›žå€¼ļ¼Œä¹Ÿę˜Æ `runs//result.json` ēš„å†…å®¹ć€‚ + +> **é‡č¦ēŗ¦å®š**:`OptimizeResult` äøŽ `RoundRecord` éƒ½åŸŗäŗŽ `EvalBaseModel`(`alias_generator=to_camel`)。**Python å†…å­˜äø­ę˜Æ snake_caseļ¼Œåŗåˆ—åŒ–åˆ° JSON ę—¶å…ØéƒØč½¬ camelCase**——读 `result.json` ę—¶ęŒ‰ camelCase ē“¢å¼•ļ¼ˆ`bestPassRate` 而非 `best_pass_rate`ļ¼‰ļ¼Œåøøč§čø©å‘ē‚¹ć€‚äø‹č”Øäø­"字段"åˆ—ē”Ø Python åļ¼ˆsnake_caseļ¼‰ļ¼ŒčÆ» JSON ę—¶ę¢ęˆ camelCase怂 + +#### 7.4.1 `OptimizeResult` 锶层字段 + +**ę øåæƒē»“ęžœå­—ę®µ**: + +| å­—ę®µļ¼ˆsnake_case) | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `status` | `Literal["SUCCEEDED", "FAILED", "CANCELED"]` | ęœ€ē»ˆēŠ¶ę€ļ¼›`FAILED` ę—¶ `best_prompts = baseline_prompts` | +| `finish_reason` | Literal | `completed` / `perfect_pass_rate` / 
`no_improvement` / `error` | +| `stop_reason` | `Optional[StopReason]` | å“Ŗę” stopper ęŠ¢é—øļ¼ˆčÆ¦č§ [§5.4](#54-åœęœŗę—¶ęœŗå®Œęˆå½“å‰č½®å†åœ));FAILED ę—©åœę—¶äøŗ `None` | +| `error_message` | `str` | FAILED ę—¶ēš„é”™čÆÆäæ”ęÆļ¼ˆé»˜č®¤ `""`) | +| `algorithm` | `str` | ē®—ę³•åļ¼ˆå¦‚ `"gepa_reflective"`) | + +**åˆ†ę•°å­—ę®µ**: + +| 字段 | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `baseline_pass_rate` | `float` | baseline 在 valset ēš„ pass rate | +| `best_pass_rate` | `float` | ęœ€ä¼˜å€™é€‰åœØ valset ēš„ pass rate | +| `pass_rate_improvement` | `float` | `best - baseline` | +| `baseline_metric_breakdown` | `dict[str, float]` | baseline ęÆę” metric 均分 | +| `best_metric_breakdown` | `dict[str, float]` | ęœ€ä¼˜å€™é€‰ęÆę” metric 均分 | +| `metric_thresholds` | `dict[str, float]` | ęÆę” metric ēš„ thresholdļ¼ˆę‹·č‡Ŗ `evaluate.metrics[].threshold`) | +| `per_metric_best_candidates` | `dict[str, list[int]]` | ęÆę” metric ēš„ Pareto å‰ę²æå€™é€‰ē“¢å¼•ļ¼ˆ0-based);空 = ē®—ę³•äøęš“éœ²ę­¤äæ”ęÆ | + +**prompt 字段**: + +| 字段 | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `baseline_prompts` | `dict[str, str]` | 起点 prompt å†…å®¹ļ¼ˆęŒ‰ TargetPrompt å­—ę®µå keyed) | +| `best_prompts` | `dict[str, str]` | ęœ€ä¼˜å€™é€‰ promptļ¼›FAILED ę—¶ = `baseline_prompts`ļ¼ˆäæčÆäŗ§ē‰©**ę°øčæœäøä¼šęÆ” baseline å·®**) | + +**轮欔字段**: + +| 字段 | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `total_rounds` | `int` | 跑了几轮 | +| `rounds` | `list[RoundRecord]` | ęÆč½®č®°å½•ļ¼ˆčÆ¦č§ §7.4.2) | + +**ē»Ÿč®”äøŽę—¶é—“å­—ę®µ**: + +| 字段 | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `total_reflection_lm_calls` | `int` | åę€ LM ē“Æč®”č°ƒē”Øę¬”ę•°ļ¼ˆå«é‡čÆ•ļ¼‰ | +| `total_token_usage` | `dict[str, int]` | åę€ LM 瓯讔 token:`{prompt, completion, total}` | +| `duration_seconds` | `float` | ꀻ wall-clock 耗ꗶ | +| `started_at` / `finished_at` | `str` | ISO-8601 ę—¶é—“ęˆ³ | + +**其他**: + +| 字段 | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `schema_version` | `str` | 默认 `"v1"`;产物 schema å‡ēŗ§ę—¶ bump | +| `extras` | `dict[str, Any]` | 
č‡Ŗå®šä¹‰äøšåŠ”å­—ę®µļ¼›ä¼˜åŒ–å™ØäøčÆ»äøå†™ | + +#### 7.4.2 `RoundRecord` å­—ę®µļ¼ˆęÆč½®äø€ę”ļ¼‰ + +**轮欔基本俔息**: + +| 字段 | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `round` | `int` | 1-based č½®ę¬”å· | +| `kind` | `Literal["reflective", "merge"]` | åę€č½® / čžåˆč½® | +| `started_at` | `str` | ISO-8601 ę—¶é—“ęˆ³ | +| `duration_seconds` | `float` | 本轮 wall-clock 耗ꗶ | + +**ę”¹å†™ęƒ…å†µ**: + +| 字段 | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `optimized_field_names` | `list[str]` | ęœ¬č½®č¢«åę€ LM ę”¹å†™ēš„å­—ę®µå | +| `candidate_prompts` | `dict[str, str]` | ęœ¬č½®å€™é€‰ēš„å…Øå­—ę®µå†…å®¹ | +| `accepted` | `bool` | ę˜Æå¦č¢«é‡‡ēŗ³äøŗę–° best | +| `acceptance_reason` | `str` | é‡‡ēŗ³å†³ē­–ēš„åÆčÆ»čÆ“ę˜Ž | +| `per_field_diagnosis` | `dict[str, str]` | åę€ LM ē»™ęÆäøŖå­—ę®µēš„čÆŠę–­ę–‡ęœ¬ | + +**čÆ„åˆ†ęƒ…å†µ**: + +| 字段 | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `validation_pass_rate` | `float` | 本轮在 valset ēš„ pass rate | +| `metric_breakdown` | `dict[str, float]` | 本轮 valset äøŠęÆę” metric å‡åˆ†ļ¼›ē©ŗ = 评轮沔跑 valset | +| `failed_case_ids` | `list[str]` | 本轮 valset å¤±č“„ēš„ case id | +| `failed_cases_truncated` | `int` | 因 token é¢„ē®—č¢«č£ęŽ‰ēš„å¤±č“„ case ꕰ | +| `train_minibatch_size` | `int` | 本轮 minibatch å¤§å°ļ¼›0 = č·³čæ‡ęœŖęŠ½ę · | +| `train_subsample_parent_score` | `Optional[float]` | parent å€™é€‰åœØ minibatch äøŠēš„åˆ†ļ¼›`None` = ęœŖč·‘ | +| `train_subsample_candidate_score` | `Optional[float]` | ę–°å€™é€‰åœØ minibatch äøŠēš„åˆ†ļ¼›`None` = ęœŖč·‘ | +| `skip_reason` | `Optional[str]` | č·³čæ‡åŽŸå› ļ¼ˆå¦‚ `"subsample perfect"`态`"no proposal"`) | +| `error_message` | `Optional[str]` | ęœ¬č½®ē®—ę³•é”™čÆÆäæ”ęÆ | + +**ē»Ÿč®”å­—ę®µ**: + +| 字段 | ē±»åž‹ | 含义 | +| --- | --- | --- | +| `reflection_lm_calls` | `int` | ęœ¬č½®åę€ LM č°ƒē”Øę¬”ę•°ļ¼ˆå«é‡čÆ•ļ¼‰ | +| `round_token_usage` | `dict[str, int]` | ęœ¬č½®åę€ LM token:`{prompt, completion, total}` | +| `budget_used` | `Optional[int]` | 瓯讔已用 metric_calls | +| `budget_total` | `Optional[int]` | 
é…ē½®é¢„ē®—äøŠé™ļ¼ˆå¦‚ `max_metric_calls`) | + +**`extras`**(`dict[str, Any]`ļ¼‰ļ¼šč‡Ŗå®šä¹‰äøšåŠ”å­—ę®µļ¼›ä¼˜åŒ–å™ØäøčÆ»äøå†™ć€‚ + +#### 7.4.3 `OptimizeResult` å®žē”Øę–¹ę³• + +| 方法 | 蔌为 | +| --- | --- | +| `dump_to(path)` | åŗåˆ—åŒ–åˆ° JSON ę–‡ä»¶ļ¼ˆ`indent=2`, `by_alias=True`) | +| `OptimizeResult.from_file(path)` | classmethodļ¼Œä»Ž JSON ååŗåˆ—åŒ– | +| `format_summary(*, output_dir, update_source)` | ē”Ÿęˆ `summary.txt` ēš„äŗŗē±»åÆčÆ»ę–‡ęœ¬ | + + + + + +## 8 äŗ§ē‰©äøŽē›®å½•ēŗ¦å®š + +ęÆč·‘äø€ę¬” `optimize()`ļ¼Œę”†ęž¶åœØ `output_dir` äø‹č½äø€ē»„å®Œę•“ēš„å®”č®”äŗ§ē‰©ć€‚ę‰€ęœ‰å†™å…„éƒ½ę˜Æ**åŽŸå­ēš„**——SIGINT / čæ›ēØ‹å“©ęŗƒéƒ½äøä¼šē•™äø‹åŠå†™ę–‡ä»¶ć€‚ + +### 8.1 ē›®å½•åøƒå±€ + +```text +runs// +ā”œā”€ā”€ result.json å®Œę•“ OptimizeResult åŗåˆ—åŒ–ļ¼ˆēØ‹åŗčÆ»å–å…„å£ļ¼‰ +ā”œā”€ā”€ summary.txt äŗŗē±»åÆčÆ»ę‘˜č¦ļ¼ˆäø€ēœ¼ēœ‹ baseline → best) +ā”œā”€ā”€ config.snapshot.json ęœ¬ę¬”č·‘ē”Øēš„ optimizer.json å®Œę•“åæ«ē…§ļ¼ˆåÆé‡ēŽ°ļ¼‰ +ā”œā”€ā”€ run.log å•č”ŒēŠ¶ę€ļ¼ŒCI č§£ęžå‹å„½ +│ +ā”œā”€ā”€ baseline_prompts/ čæč”Œå‰ēš„ prompt åæ«ē…§ļ¼ˆęÆå­—ę®µäø€äøŖ .md) +│ ā”œā”€ā”€ system_prompt.md +│ └── ... +│ +ā”œā”€ā”€ best_prompts/ ä¼˜åŒ–å¾—åˆ°ēš„ęœ€ä¼˜å€™é€‰ļ¼ˆęÆå­—ę®µäø€äøŖ .md) +│ ā”œā”€ā”€ system_prompt.md +│ └── ... +│ +└── rounds/ ęÆč½®ēš„å®Œę•“ RoundRecord + ā”œā”€ā”€ round_001.json + ā”œā”€ā”€ round_002.json + └── ... +``` + +ęÆäøŖę–‡ä»¶ēš„č§’č‰²ļ¼š + +| ꖇ件 / 目录 | 何时写 | å¹²ä»€ä¹ˆē”Ø | +| --- | --- | --- | +| `result.json` | ä¼˜åŒ–ē»“ęŸļ¼ˆå«å¤±č“„ļ¼‰ | ēØ‹åŗčÆ»å–ęœ€ęƒåØäŗ§ē‰©ć€‚å®Œę•“ `OptimizeResult` åŗåˆ—åŒ–ļ¼ˆčÆ¦č§ [§7.4](#74-optimizeresult--roundrecord-字段蔨))。**å­—ę®µå camelCase** | +| `summary.txt` | ä¼˜åŒ–ē»“ęŸļ¼ˆä»…ęˆåŠŸļ¼‰ | äŗŗē±»åÆčÆ»ę‘˜č¦ļ¼š`baseline → best` č¶‹åŠæć€metric breakdown态ꉀ꜉ best 字段 + 字符数、artifacts 目录瓢引 | +| `config.snapshot.json` | 优化开始 | ęœ¬ę¬”č·‘ē”Øēš„ `optimizer.json` å®Œę•“åæ«ē…§ā€”ā€”åŽē»­ęƒ³"å¤č·‘čæ™ę¬”ē»“ęžœ"ē›“ęŽ„ē”Øå®ƒ | +| `run.log` | ä¼˜åŒ–ē»“ęŸ | å•č”Œļ¼š` status=... algorithm=... 
baseline=0.4 best=0.85 delta=+0.45 rounds=10 duration_seconds=120.5`;CI 平台 grep 友好 | +| `baseline_prompts/.md` | 优化开始 | 运行前每个 TargetPrompt 字段的内容快照——**无论 `update_source` 设什么都会写**(最重要的兜底产物) | +| `best_prompts/.md` | 优化结束(仅有 result 时) | 最优候选 prompt——`update_source=False` 时这是最有价值的产物(待人工 review 后同步) | +| `rounds/round_.json` | 每轮结束 | 完整 `RoundRecord` 序列化(详见 [§7.4.2](#742-roundrecord-字段每轮一条));3 位零填充编号便于排序 | + +### 8.2 哨兵文件:让用户主动停优化 + +源码:`_optimize_gepa_reflective.py:_build_stop_callbacks` 末尾。 + +跑优化期间,用户在 `output_dir` 下**手动 `touch optimize.stop`**: + +```bash +touch runs//optimize.stop +``` + +下一轮开头框架检测到该文件即停(`gepa.utils.FileStopper` 实现),`stop_reason="user_requested_stop"`。**典型用途**:跑了一半发现已经够用 / 临时要释放 LLM 配额——比 Ctrl+C 更优雅,能保证当前轮完成后干净落盘。 + +### 8.3 原子落盘保证 + +**所有产物都用 tmp + `os.replace` 原子写**——POSIX 保证 rename 原子,进程被 kill / 断电时 `output_dir` 里要么是干净的旧文件、要么是干净的新文件,**永远不会出现半写状态**。 + +源码:`_agent_optimizer.py` 的两个工具函数: +- `_atomic_write_text(path, content)`:先写 `.tmp`,再 `os.replace(tmp, path)` +- `_mask_sigint`:上下文管理器,在 `_persist_artifacts` 期间屏蔽 SIGINT(避免"第二次 Ctrl+C 打断 finally 落盘") + +**`update_source=True` 的源 prompt 文件回写**:使用 `TargetPrompt.write_all`,对**多字段**也保证原子性——任一字段写失败,已写成功的字段全部回滚到 pre-call 内容(详见 [§7.2](#72-targetprompt-api-表) 的 `write_all` 契约)。 + +> **极端容错**:如果 `update_source=True` 写源文件时 `os.replace` 自身失败(如目标文件所在目录被并发删
除),框架会**显式调 `write_all(baseline)` 把源文件恢复到运行前内容**,再上抛原异常——保证业务永远拿不到一个"半优化"的源文件。 + + +## 9 想自己扩展? + +源码总入口:`_optimize_registrations.py`。框架通过**注册表机制**支持三类扩展,不需要 fork SDK。 + +### 9.1 注册新算法 + +源码:`_base_optimizer.py:BaseOptimizer` + `_optimize_registry.py:OPTIMIZER_REGISTRY`。 + +写一个 `BaseOptimizer` 子类,实现 `async def run(self, *, reporter=None) -> OptimizeResult`,注册到 `OPTIMIZER_REGISTRY`: + +```python +from trpc_agent_sdk.evaluation._base_optimizer import BaseOptimizer +from trpc_agent_sdk.evaluation._optimize_registry import OPTIMIZER_REGISTRY +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult + + +class MyOwnOptimizer(BaseOptimizer): + async def run(self, *, reporter=None) -> OptimizeResult: + # 你的算法主循环。基类已注入: + # self.config - OptimizeConfigFile(含 evaluate / optimize 两段) + # self.call_agent - 业务 agent 适配函数 + # self.target_prompt - TargetPrompt 实例 + # self.train_dataset_path / self.validation_dataset_path + # self.callbacks / self.output_dir + # self.extra_stop_callbacks / self.extra_gepa_callbacks + ... + return OptimizeResult(...) 
+ + +# 注册:第二个参数必须是 BaseOptimizer 子类,否则 register() 抛 TypeError +OPTIMIZER_REGISTRY.register("my_own_algo", MyOwnOptimizer) +``` + +业务侧用法:在 `optimizer.json` 中把 `optimize.algorithm.name` 改为 `"my_own_algo"`,框架启动期通过 `OPTIMIZER_REGISTRY.get(...)` 查到你的类、实例化、跑 `run()`。 + +**注意**:`GepaReflectiveAlgo.name` 当前是 `Literal["gepa_reflective"]`——**新算法需要新加一个 `pydantic.BaseModel` 配置类**(如 `MyOwnAlgo`),并修改 `OptimizeConfig.algorithm` 字段为 discriminated union(详见 `_optimize_config.py:OptimizeConfig` 的 docstring)。 + +### 9.2 注册自定义 stopper + +源码:`_agent_optimizer.py:AgentOptimizer.optimize` 的 `extra_stop_callbacks` 参数。 + +通过 `extra_stop_callbacks` 在运行时注入——**不需要改配置文件**: + +```python +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _LabeledStopper + + +class MySloMonitorStopper: + """自定义 stopper:检查外部 SLO 监控系统,超阈值就停。""" + + def __init__(self, slo_client): + self._slo = slo_client + self.last_triggered = False + + def __call__(self, gepa_state=None) -> bool: + if self._slo.is_p99_breached(): + self.last_triggered = True + return True + return False + + +# 用法: +stopper = MySloMonitorStopper(slo_client) +result = await AgentOptimizer.optimize( + ..., + extra_stop_callbacks=[ + # 普通 stopper:stop_reason 显示为 "completed" + stopper, + + # 想要稳定的 stop_reason 标签时,用 _LabeledStopper 包装: + # _LabeledStopper(stopper, "slo_breach"), # 但 "slo_breach" 不在 StopReason Literal 里,pydantic 会拒 + ], +) +``` + +**接口约定**(见 `_LabeledStopper`): +- 必须有 `__call__(self, gepa_state=None) -> bool` 方法 +- `True` 即表示停机 +- 应有 `last_triggered: bool` 属性供 `_classify_stop_reason` 读取 + +**`stop_reason` 的两种行为**: +- 普通 callable / 自定义类:触发时 `stop_reason` 显示为 `"completed"`(gepa 不知道你
为什么停) +- 用 `_LabeledStopper(inner, label)` 包装:`label` 必须是 `StopReason` Literal 的合法值(见 `_optimize_result.py`);自定义新 label 需扩展 Literal 类型 + +### 9.3 注册自定义评测 callback + +源码:`_agent_optimizer.py:AgentOptimizer.optimize` 的 `extra_gepa_callbacks` 参数。 + +通过 `extra_gepa_callbacks` 接入 gepa 内部事件——典型用途:转发到 dashboard / 实时监控 metric。 + +```python +class MyDashboardCallback: + def on_proposal_end(self, *args, **kwargs) -> None: + # 上报到 Grafana / WandB / 内部监控 + ... + + # gepa 静默忽略缺失的方法,按需实现部分协议方法即可 + +result = await AgentOptimizer.optimize( + ..., + extra_gepa_callbacks=[MyDashboardCallback()], +) +``` + +**协议约束**:每个 callback 应实现 `gepa.core.callback.GEPACallback` 协议中的若干方法(`on_iteration_start` / `on_proposal_start` / `on_proposal_end` / `on_valset_breakdown` / ...)。**gepa 静默忽略 callback 缺失的方法**,所以业务可以只实现关心的那几个。 + + +## 10 FAQ + +**Q:跑了一次 `result.json` 里 `bestPassRate` 跟 `baselinePassRate` 一样、`accepted` 全是 false——是 bug 吗?** + +不是。优化没找到比 baseline 更好的候选——`status="SUCCEEDED"` + `finish_reason="no_improvement"` 是这种情况的典型组合,`best_prompts` 等于 `baseline_prompts`。可能原因:baseline 已经很好、`max_metric_calls` 给得太小没跑到改进点、训练集和验证集分布差太多、metric 噪声太大(建议提高 `num_runs`)。 + +--- + +**Q:`update_source=True` 跑挂了,源 prompt 文件被改坏了吗?** + +不会。两道保险:(1) 优化失败(`status="FAILED"`)时框架根本不会调 `write_all` 回写优化结果;(2) 即便 `write_all` 自身失败,源文件也通过 tmp + `os.replace` 原子回滚(详见 [§8.3](#83-原子落盘保证))。 + +--- + +**Q:能跑到一半改 `optimizer.json` 吗?** + +不能。`optimizer.json` 
åœØåÆåŠØęœŸäø€ę¬”ę€§ loadļ¼ŒåŽē»­äæ®ę”¹äøä¼šč¢«čÆ»å–ć€‚å“Øå…µę–‡ä»¶ `optimize.stop` ę˜Æå”Æäø€ę”ÆęŒēš„"čæč”Œę—¶å¹²é¢„"(详见 [§8.2](#82-å“Øå…µę–‡ä»¶č®©ē”Øęˆ·äø»åŠØåœä¼˜åŒ–))。 + +--- + +**Qļ¼šč®­ē»ƒé›†å¾ˆå°ļ¼ˆ< 5 caseļ¼‰čƒ½č·‘å—ļ¼Ÿ** + +åÆä»„ļ¼Œä½†ę•ˆęžœå·®ļ¼š(1) åę€ LM ēœ‹ēš„åé¦ˆę ·ęœ¬å¤Ŗå°‘ļ¼Œę”¹å†™ę–¹å‘äøēØ³ļ¼›(2) å°č®­ē»ƒé›†å®¹ę˜“č®© advanced é…ē½®čæ‡ę‹Ÿåˆļ¼ˆå‚č€ƒ [§4.8](#48))。建议至少 5~10 ę” caseļ¼›< 5 ę—¶č€ƒč™‘å…ˆåšę‰‹å·„č°ƒä¼˜ć€‚ + +--- + +**Q:`call_agent` å†…éƒØå‘ HTTP / RPC ę—¶ę€Žä¹ˆå¤„ē†é‡čÆ•ļ¼Ÿ** + +ē”± `call_agent` č‡Ŗå·±å¤„ē†ć€‚ę”†ęž¶äøę›æäøšåŠ”åš LLM / ęœåŠ”č°ƒē”Øå±‚ēš„é‡čÆ•ā€”ā€”č®¾č®”äøŠäæęŒ `call_agent` ę˜Æé»‘ē›’ć€‚å¦‚ęžœč°ƒē”Øå¤±č“„ļ¼Œé‚£äø€ę” case čÆ„ęµ‹åˆ†ę•°č®” 0ļ¼Œåę€ LM ä¼šēœ‹åˆ°é”™čÆÆäæ”ęÆļ¼ˆå‚č€ƒ §5.2 åę€ LM åé¦ˆē»“ęž„ļ¼‰ć€‚ + +--- + +**Q:能让多个 `optimize()` åŒę—¶č·‘ć€å…±äŗ«äø€äøŖ `output_dir` å—ļ¼Ÿ** + +äøåÆä»„ć€‚å¤šäøŖčæ›ēØ‹åŒå†™äø€äøŖ `output_dir`ļ¼ŒåŽŸå­å†™ēŗ¦ęŸäæęŠ¤å•ę–‡ä»¶äøč¢«åŠå†™ļ¼Œä½†**å¤šäøŖčæ›ēØ‹ē›øäŗ’č¦†ē›–ę–‡ä»¶**——`result.json` / `rounds/round_001.json` ē­‰ä¼šäŗ’ē›øčø©ć€‚ęÆę¬”č·‘ē”Øē‹¬ē«‹ timestamp 子目录。 + +--- + +**Qļ¼šē”Ø black-box `call_agent` ęØ”å¼ę—¶ļ¼Œčƒ½ē”Ø `tool_trajectory_avg_score` 这类 metric å—ļ¼Ÿ** + +äøčƒ½ć€‚`call_agent` é»‘ē›’ęØ”å¼ę‹æäøåˆ° session traces / tool intermediate_dataļ¼Œę”†ęž¶åÆåŠØęœŸä¼š fail-fast ę‹’ē»ļ¼ˆčÆ¦č§ [§7.1](#71-agentoptimizeroptimize-å‚ę•°č”Ø) åÆåŠØęœŸę£€ęŸ„č”Øļ¼‰ć€‚ę”¹ē”Øå“åŗ”ēŗ§ metric:`final_response_avg_score` / `llm_rubric_response` / `llm_final_response`怂 + +--- + +**Qļ¼šč·‘å®Œ `update_source=False`,源 prompt čæ˜åœØåŽŸåœ°ļ¼Œä½† `target_prompt.write_all` åœØčæ‡ēØ‹äø­ę˜Æč¢«åå¤č°ƒčæ‡ēš„ļ¼Ÿ** + +ę˜Æēš„ć€‚ä¼˜åŒ–å™Øäø»å¾ŖēŽÆęÆę¬”äŗ§ē”Ÿę–°å€™é€‰éƒ½ä¼š `write_all` ęŠŠå€™é€‰å†™åˆ° `add_path` ę³Øå†Œēš„ęŗę–‡ä»¶ā€”ā€”čæ™ę˜Æäøŗäŗ†č®©äø‹äø€ę¬” `call_agent` č°ƒē”Øčƒ½čÆ»åˆ°ę–° prompt怂**`finally` é˜¶ę®µä¼šč‡ŖåŠØ `write_all(baseline_snapshot)` ęŠŠęŗę–‡ä»¶å›žę»šåˆ° baseline 内容**(源码:`_agent_optimizer.py:optimize` ēš„ `cleanup_done` sentinel)。所仄 
`update_source=False` č·‘å®ŒåŽļ¼Œęŗę–‡ä»¶**äøŽčæč”Œå‰å®Œå…Øäø€č‡“**ā€”ā€”å‰ęę˜Æ `TargetPrompt.write_all` ę²”åœØå›žę»šé˜¶ę®µęŠ›é”™ļ¼ˆęžē«Æęƒ…å†µäø‹ęŠ›é”™ę—¶ę”†ęž¶ä¼šč®° warning ä½†äøå½±å“ `result.json` / `best_prompts/` ēš„äŗ§å‡ŗļ¼‰ć€‚ + +--- + +**Qļ¼šę€Žä¹ˆ"å¤č·‘"äøŠę¬”ēš„ä¼˜åŒ–ē»“ęžœļ¼Ÿ** + +å¤č·‘ `runs//config.snapshot.json` å³åÆā€”ā€”å®ƒę˜ÆäøŠę¬”ēš„å®Œę•“é…ē½®åæ«ē…§ć€‚ä½† LLM č¾“å‡ŗå­˜åœØéšęœŗę€§ļ¼Œå³ä½æé…ē½®äø€č‡“ä¹ŸåÆčƒ½å¾—åˆ°äøåŒ best_promptsļ¼›`seed` å­—ę®µå›ŗå®ščƒ½å‡å°‘ļ¼ˆäøčƒ½ę¶ˆé™¤ļ¼‰čæ™ē§éšęœŗę€§ć€‚A/B å®žéŖŒę—¶äø€å®šč¦é” seedļ¼ˆå‚č€ƒ [§4.8](#48))。 + + diff --git a/examples/optimization/advanced_strategies/README.md b/examples/optimization/advanced_strategies/README.md new file mode 100644 index 0000000..dbb261f --- /dev/null +++ b/examples/optimization/advanced_strategies/README.md @@ -0,0 +1,206 @@ +# Advanced Strategies — GEPA é«˜é˜¶ē­–ē•„ē»„åˆ A/B 对照 + +> **é€‚ē”Øåœŗę™Æ**ļ¼šå·²ē†Ÿę‚‰ GEPA åŸŗęœ¬ęµēØ‹ļ¼ŒåøŒęœ›čæ›äø€ę­„ē†č§£ `candidate_selection_strategy` / `frontier_type` / `use_merge` / `skip_perfect_score` ē­‰é«˜é˜¶é…ē½®åœØēœŸå®žä»»åŠ”äøŠēš„č”Œäøŗå·®å¼‚ć€‚ęœ¬ example č·‘ baseline äøŽ advanced äø¤å„—é…ē½®åŽē”Ø `compare.py` č¾“å‡ŗåÆ¹ęÆ”č”Øć€‚é˜…čÆ»å‰čÆ·å…ˆē†Ÿę‚‰ `quickstart/README.md` §2怂 + +## 1 Ā· é€‚ē”Øé—®é¢˜äøŽč®¾č®”ē›®ę ‡ + +GEPA é«˜é˜¶é…ē½®å¼€å…³å¤šļ¼ŒäøšåŠ”ę–¹åøøč§å›°ęƒ‘ļ¼š + +- "打开 `use_merge=true` ēœŸēš„ä¼šč§¦å‘ merge å—ļ¼Ÿ" +- "`frontier_type` 选 `instance` 还是 `objective` åÆ¹ęˆ‘ēš„ä»»åŠ”ęœ‰ä»€ä¹ˆå½±å“ļ¼Ÿ" +- "`skip_perfect_score=true` čƒ½ēœå¤šå°‘ reflection LM č°ƒē”Øļ¼Ÿ" + +å•č·‘äø€ę¬”ä¼˜åŒ–å¾€å¾€ēœ‹äøå‡ŗå·®å¼‚ļ¼Œå› äøŗ GEPA åœØå¤šę•°ä»»åŠ”äøŠéƒ½čƒ½ę”¶ę•›åˆ°ē›øčæ‘ `best_pass_rate`ć€‚ęœ¬ example 用 A/B åÆ¹ē…§ę–¹ę³•ęš“éœ²å·®å¼‚ļ¼š + +- **ę–¹ę”ˆ A(baseline)**ļ¼šåŸŗē”€ē­–ē•„ē»„åˆ +- **ę–¹ę”ˆ B(advanced)**ļ¼šé«˜é˜¶ē­–ē•„ē»„åˆļ¼ˆ`frontier_type=objective` + `skip_perfect_score=true` + `use_merge=true`) +- **任劔设讔**ļ¼šåœ°å€č§£ęžä»»åŠ”ļ¼Œę··åˆ"å®Œę•“åœ°å€"äøŽ"ē¼ŗäæ”ęÆåœ°å€"两类 caseļ¼Œåˆ¶é€ å¤šē›®ę 
‡å±€éƒØęœ€ä¼˜ē©ŗé—“ + +| 输兄 | 输出 | +| --- | --- | +| äø¤å„—äøåŒēš„ `optimizer_*.json` é…ē½® | äø¤ę¬”ē‹¬ē«‹ēš„ä¼˜åŒ–čæč”Œē»“ęžœ | +| `compare.py` č§£ęžäø¤ę¬”ēš„ `result.json` | å¤šē»“åŗ¦åÆ¹ęÆ”č”Ø | + +### 本 example ę¼”ē¤ŗēš„ęœ€å°ē”Øä¾‹ + +| 结度 | 值 | +| --- | --- | +| 业劔任劔 | č‡Ŗē”±ę–‡ęœ¬åœ°å€č§£ęžäøŗäø„ę ¼ JSON `{country, city, postal_code, street}`ļ¼ˆē¼ŗäæ”ęÆå­—ę®µč¾“å‡ŗ `null`) | +| ä¼˜åŒ–ē›®ę ‡ | `agent/prompts/system.md` å•å­—ę®µ | +| č®­ē»ƒé›† | 6 ę” case:3 ę”å®Œę•“åœ°å€ + 3 ę”ē¼ŗäæ”ęÆåœ°å€ | +| éŖŒčÆé›† | 6 ę” case | + +## 2 Ā· ęœÆčÆ­åÆ¹ē…§ + +ä»…åˆ—å‡ŗęœ¬ example å¼•å…„ēš„ę–°ę¦‚åæµć€‚åŸŗē”€ęœÆčÆ­č§ `quickstart/README.md` §2怂 + +| 术语 | 含义 | +| --- | --- | +| **candidate_selection_strategy** | åę€ęÆč½®é€‰å“ŖäøŖå€™é€‰ä½œäøŗäŗ²ęœ¬ēš„ē­–ē•„ć€‚åÆé€‰ `pareto` / `current_best` / `epsilon_greedy` / `top_k_pareto`怂 | +| **frontier_type** | Pareto å‰ę²æē²’åŗ¦ć€‚åÆé€‰ `instance`ļ¼ˆęŒ‰ case) / `objective`ļ¼ˆęŒ‰ metric) / `hybrid`ļ¼ˆåŒå±‚ļ¼‰ / `cartesian`ļ¼ˆęŒ‰ caseƗmetric)。 | +| **skip_perfect_score** | åę€ minibatch ęŠ½ę ·ę—¶ę˜Æå¦č·³čæ‡å·²ę»”åˆ†ēš„ case怂 | +| **predictor-level merge** | merge ę“ä½œåœØ prompt å­—ę®µå±‚é¢čæ›č”Œć€‚**éœ€č¦č‡³å°‘ 2 äøŖå­—ę®µę‰ęœ‰ę„ä¹‰**ā€”ā€”å•å­—ę®µä¼˜åŒ–äø‹ merge ę°øčæœäøä¼šč§¦å‘ć€‚ | +| **merge_val_overlap_floor** | č§¦å‘ merge ēš„ęœ€ä½Ž val 集 case é‡å ę•°ļ¼ˆé»˜č®¤ 5)。 | + +## 3 Ā· čæč”Œē¤ŗä¾‹ + +### 3.1 å®‰č£…ä¾čµ– + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 é…ē½®ēŽÆå¢ƒå˜é‡ + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 é”ŗåŗč·‘äø¤ę¬”ä¼˜åŒ– + +```bash +cd examples/optimization/advanced_strategies +python3 run_baseline.py # é…ē½® A:basic ē­–ē•„ē»„åˆ +python3 run_advanced.py # é…ē½® Bļ¼šé«˜é˜¶ē­–ē•„ē»„åˆ +``` + +ęÆę¬”čæč”Œēŗ¦ 3 åˆ†é’Ÿć€‚ + +### 3.4 输出对比蔨 + +```bash +python3 compare.py +``` + +`compare.py` č‡ŖåŠØé€‰å– `runs/` äø‹ęœ€ę–°ēš„ `baseline_*` äøŽ `advanced_*` ē›®å½•č§£ęž 
`result.json`ļ¼Œč¾“å‡ŗå¤šē»“åŗ¦åÆ¹ęÆ”č”Øļ¼ˆč½®ę¬”ę•°ć€ęŽ„å—ēŽ‡ć€merge č§¦å‘ę¬”ę•°ć€reflection LM č°ƒē”Øę•°ć€baseline / best pass_rate 等)。 + +## 4 Ā· ęž¶ęž„äøŽę•°ę®ęµ + +``` +[run_baseline.py] [run_advanced.py] + │ │ + ā”œā”€ā”€ optimizer_baseline.json ā”œā”€ā”€ optimizer_advanced.json + │ instance frontier │ objective frontier + │ skip_perfect_score=false │ skip_perfect_score=true + │ use_merge=false │ use_merge=trueļ¼ˆå•å­—ę®µå®žé™…äøč§¦å‘ļ¼‰ + │ │ + └── runs/baseline_/result.json └── runs/advanced_/result.json + + ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” + │ python3 compare.py │ + │ _latest("baseline") │ + │ _latest("advanced") │ + │ č§£ęž result.json │ + │ 输出对比蔨 │ + ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +### 4.1 ę–‡ä»¶ęø…å• + +| ꖇ件 | 角色 | ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ēš„äæ®ę”¹ę–¹å‘ | +| --- | --- | --- | +| `run_baseline.py` | basic é…ē½®å…„å£ | äøŽ quickstart 同 | +| `run_advanced.py` | é«˜é˜¶é…ē½®å…„å£ | č°ƒę•“ `optimizer_advanced.json` äø­ē­–ē•„ē»„åˆ | +| `compare.py` | č§£ęžäø¤ę¬” `result.json` 输出对比蔨 | 添加 / åˆ é™¤å…³ę³Øēš„åÆ¹ęÆ”ē»“åŗ¦ | +| `agent/agent.py` | åœ°å€č§£ęž LlmAgent + `_normalize_json` | ę›æę¢äøŗäøšåŠ” agent | +| `agent/prompts/system.md` | baseline promptļ¼ˆę•…ę„ęžē®€ļ¼‰ | å†™å…„äøšåŠ” baseline | +| `optimizer_baseline.json` | basic ē­–ē•„ JSON | č°ƒę•“é˜ˆå€¼äøŽ metric | +| `optimizer_advanced.json` | é«˜é˜¶ē­–ē•„ JSON | č°ƒę•“é«˜é˜¶å¼€å…³ | +| `data/train.evalset.json` / `data/val.evalset.json` | ę•°ę®é›† | ę›æę¢äøŗäøšåŠ”ē”Øä¾‹ | + +## 5 Ā· é«˜é˜¶ē­–ē•„åÆ¹ē…§ + +### 5.1 é…ē½®å·®å¼‚é€ŸęŸ„ + +| é…ē½®é”¹ | baseline | advanced | +| --- | --- | --- | +| `candidate_selection_strategy` | `pareto` | `pareto` | +| `frontier_type` | `instance` | `objective` | +| `skip_perfect_score` | `false` | `true` | +| `use_merge` | `false` | `true` | +| `module_selector` | `round_robin` | `round_robin` | + +### 5.2 `frontier_type` instance vs objective + 
+| 取值 | 蔌为 | åœØęœ¬ä»»åŠ”äøŠēš„č”ØēŽ° | +| --- | --- | --- | +| `instance` | ęÆę” case ē»“ęŠ¤äø€äøŖ best å€™é€‰ļ¼Œåę€ēœ‹é€ case 反馈 | ęŽ„å—é—Øę§›č¾ƒé«˜ļ¼ˆéœ€åœØęŸ case äøŠäø„ę ¼ä¼˜äŗŽåŽ†å²ļ¼‰ļ¼Œrounds_accepted č¾ƒå°‘ | +| `objective` | ęÆę” metric ē»“ęŠ¤äø€äøŖ bestļ¼Œåę€ēœ‹čšåˆåˆ†ę•° | ęŽ„å—é—Øę§›č¾ƒä½Žļ¼ˆčšåˆåˆ†ęœ‰ęå‡å³ęŽ„å—ļ¼‰ļ¼Œrounds_accepted č¾ƒå¤šä½† valset ę˜“éœ‡č” | + +`objective` ę›“ęæ€čæ›ć€‚å°č®­ē»ƒé›†ļ¼ˆ< 10 caseļ¼‰äø‹åÆčƒ½čæ‡ę‹Ÿåˆ train minibatchļ¼Œé€ ęˆ valset pass_rate ę³¢åŠØć€‚ + +### 5.3 `skip_perfect_score` ēš„å®žé™…čŠ‚ēœ + +ē†č®ŗäøŠčƒ½å‡å°‘äøåæ…č¦ēš„ reflection LM č°ƒē”Øć€‚å®žé™…čŠ‚ēœå¹…åŗ¦å–å†³äŗŽļ¼š + +- baseline čµ·ē‚¹é«˜åŗ¦ļ¼ˆbaseline=0 ę—¶ę—©ęœŸę»”åˆ† case ęžå°‘ļ¼ŒčŠ‚ēœęœ‰é™ļ¼‰ +- č®­ē»ƒé›†č§„ęØ”ļ¼ˆå°č®­ē»ƒé›†äø‹ę»”åˆ† case 在 minibatch äø­ęÆ”ä¾‹äøēØ³å®šļ¼‰ + +本 example å®žęµ‹ēŗ¦čŠ‚ēœ 1 ꬔ reflection č°ƒē”Øļ¼Œå·®å¼‚äøę˜¾č‘—ć€‚čÆ„å¼€å…³åœØ**å¤§č§„ęØ”č®­ē»ƒé›† + é«˜åŸŗēŗæčµ·ē‚¹**åœŗę™Æäø‹ę•ˆęžœę›“ę˜Žę˜¾ć€‚ + +## 6 Ā· å…³é”®é…ē½®ļ¼ˆå«äø¤ę”čø©å‘č­¦ē¤ŗļ¼‰ + +### 6.1 `use_merge` åœØå•å­—ę®µä¼˜åŒ–äø‹äøä¼šč§¦å‘ + +merge 是 predictor-level ę“ä½œļ¼Œ**éœ€č¦č‡³å°‘ 2 äøŖå­—ę®µę‰ęœ‰ę„ä¹‰**ć€‚ęœ¬ example ę˜Æå•å­—ę®µä¼˜åŒ–ļ¼Œå› ę­¤ `optimizer_advanced.json` äø­ `use_merge=true` č®¾ē½®ę— å‰Æä½œē”Øļ¼Œä½†ä¹Ÿäøä¼šåø¦ę„ä»»ä½•å®žé™… merge č”Œäøŗā€”ā€”`compare.py` 输出中 `merge_rounds_total=0` ę˜Æé¢„ęœŸć€‚ + +éœ€č¦č§‚åÆŸ merge å®žé™…ę•ˆęžœę—¶ļ¼Œå‚č§ `multi_agent_pipeline/` exampleļ¼Œå…¶ 4 å­—ę®µé…ē½®äø‹ merge ä¼šēœŸå®žč§¦å‘ć€‚ + +### 6.2 `result.json` å­—ę®µå‘½åäøŗ camelCase + +SDK å†…éƒØä½æē”Ø snake_case å­—ę®µåļ¼ˆå¦‚ `stop_reason` / `total_rounds` / `best_pass_rate`ļ¼‰ļ¼Œä½†åŗåˆ—åŒ–åˆ° `result.json` ę—¶ä¼šč‡ŖåŠØč½¬ę¢äøŗ camelCase(`stopReason` / `totalRounds` / `bestPassRate`)。 + +čæ™ę˜Æå› äøŗ `EvalBaseModel` ēš„ `alias_generator=to_camel`ļ¼Œåŗåˆ—åŒ–ę—¶ `by_alias=True`怂 + +**čø©å‘ęé†’**ļ¼šē”Ø Python 读 `result.json` ę—¶ęŒ‰ camelCase ē“¢å¼•ļ¼š + +```python +data = 
json.loads(Path("result.json").read_text()) +print(data["bestPassRate"]) # āœ… +print(data["best_pass_rate"]) # āŒ KeyError +``` + +`compare.py` äø­å·²ē»ęŒ‰ camelCase č§£ęžļ¼›č‡Ŗęœ‰č„šęœ¬čÆ» `result.json` ę—¶åŒę ·ęŒ‰ę­¤ēŗ¦å®šć€‚ + +### 6.3 `frontier_type` å–å€¼ēŗ¦ęŸ + +SDK ä»…ęŽ„å—ä»„äø‹ 4 äøŖå­—é¢é‡å€¼ļ¼š + +``` +"instance" | "objective" | "hybrid" | "cartesian" +``` + +å…¶ä»–å–å€¼ļ¼ˆå¦‚ `"aggregate"` / `"mixed"`ļ¼‰ä¼šåœØ pydantic å±‚é¢ē›“ęŽ„ `ValidationError`ļ¼Œę— ę³•åÆåŠØä¼˜åŒ–ć€‚é…ē½®å‰čÆ·ē”®č®¤ę‹¼å†™ć€‚ + +## 7 Ā· åøøč§é—®é¢˜ + +**Qļ¼šäøŗä»€ä¹ˆäø¤ę¬”č·‘ēš„ `best_pass_rate` ē»åøøē›øåŒļ¼Ÿ** +A:GEPA 是 Pareto ä¼˜åŒ–ē®—ę³•ļ¼ŒåœØē®€å•ä»»åŠ” + å……č¶³é¢„ē®—äø‹äø¤å„—ē­–ē•„ęœ€ē»ˆåøøę”¶ę•›åˆ°åŒäø€ęœ€ä¼˜ć€‚å·®å¼‚å¾€å¾€ä½“ēŽ°åœØ**åˆ°č¾¾č·Æå¾„**ļ¼ˆč½®ę¬”ę•°ć€ęŽ„å—ēŽ‡ć€merge č”Œäøŗļ¼‰č€Œéžęœ€ē»ˆåˆ†ę•°ć€‚čæ™ę­£ę˜Æęœ¬ example 设讔 `compare.py` å…³ę³Øå¤šē»“åŗ¦č€Œéžå•äø€ `best_pass_rate` ēš„åŽŸå› ć€‚ + +**Q:advanced ęŽ„å—äŗ† 4 轮但 baseline åŖęŽ„å—äŗ† 2 č½®ļ¼Œę˜Æäøę˜Æ advanced ę›“å„½ļ¼Ÿ** +Aļ¼šäøäø€å®šć€‚`objective` frontier ęŽ„å—é—Øę§›ä½Žļ¼ŒåÆčƒ½"ęŽ„å—äŗ†äø€äøŖ train äøŠę›“å„½ä½† val äøŠę›“å·®"ēš„å€™é€‰ć€‚éœ€ē»“åˆęÆč½®ēš„ `valset pass_rate` č¶‹åŠæč§‚åÆŸę˜Æå¦čæ‡ę‹Ÿåˆć€‚ + +**Q:`compare.py` 输出 `merge_rounds_total=0` ä½†ęˆ‘å¼€äŗ† `use_merge=true`?** +Aļ¼šå•å­—ę®µä¼˜åŒ–äø‹ē¬¦åˆé¢„ęœŸć€‚å‚č§ §6.1怂 + +**Qļ¼šę€Žä¹ˆēŸ„é“ę˜Æå“Ŗäø€č½®č¢«ęŽ„å—ēš„ć€ę˜Æåę€čæ˜ę˜Æ merge?** +A:`result.json` äø­ `rounds[*]` ę•°ē»„ęÆę”č®°å½•éƒ½ęœ‰ `accepted: true/false` 和 `kind: "reflective" | "merge"` å­—ę®µļ¼ŒåÆē›“ęŽ„éåŽ†ęŸ„ēœ‹ć€‚ + +**Q:advanced é…ē½®é‡Œ `seed` åŗ”čÆ„å’Œ baseline äæęŒäø€č‡“å—ļ¼Ÿ** +Aļ¼šäæęŒäø€č‡“ä¾æäŗŽåÆ¹ęÆ”ę—¶ęŽ’é™¤éšęœŗę€§å½±å“ć€‚ęœ¬ example 两份 JSON éƒ½ē”ØåŒäø€ `seed`怂 + +## 8 Ā· ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ēš„ę­„éŖ¤ + +1. **å¤åˆ¶ęœ¬ example ä½œäøŗåÆ¹ē…§ęØ”ęæ**ļ¼šäæē•™ `run_baseline.py` / `run_advanced.py` / `compare.py` äø‰č„šęœ¬ē»“ęž„ +2. **ę›æę¢äøšåŠ” agent**:`agent/agent.py` ę”¹äøŗäøšåŠ” agent å®žēŽ° +3. 
**č®¾č®”äø¤å„—é…ē½® JSON**: + - `optimizer_baseline.json`ļ¼šå½“å‰ēŗæäøŠé…ē½®ęˆ–é»˜č®¤é…ē½® + - `optimizer_advanced.json`ļ¼šåøŒęœ›éŖŒčÆēš„é«˜é˜¶ē»„åˆ + - äŗŒč€…äæęŒ `seed` / `max_metric_calls` 一臓仄便公平对比 +4. **ę›æę¢ę•°ę®é›†**:业劔 train / val +5. **跑两欔 + compare**ļ¼šę ¹ę®åÆ¹ęÆ”č”Øå¤šē»“åŗ¦čÆ„ä¼°é«˜é˜¶é…ē½®åœØäøšåŠ”ä»»åŠ”äøŠēš„å®žé™…ę”¶ē›Š +6. **决策**ļ¼šęŠŠåÆ¹ęÆ”č”Øäø­č”ØēŽ°ę˜Žę˜¾ę›“ä¼˜ēš„é…ē½®ä½œäøŗē”Ÿäŗ§é…ē½® + +> é«˜é˜¶é…ē½®äøę˜Æ"č¶Šå¤ę‚č¶Šå„½"ć€‚č®øå¤šä»»åŠ”äøŠ baseline é…ē½®å·²čƒ½č¾¾åˆ°åˆē†ę”¶ę•›ļ¼Œadvanced åŖåœØē‰¹å®šä»»åŠ”ē»“ęž„ļ¼ˆå¤šē›®ę ‡ć€å¤šå­—ę®µć€å¤§č§„ęØ”č®­ē»ƒé›†ē­‰ļ¼‰äø‹ę˜¾ē¤ŗä»·å€¼ć€‚**ē”Øę•°ę®å†³å®šļ¼Œäøē”Øē›“č§‰**怂 diff --git a/examples/optimization/advanced_strategies/agent/__init__.py b/examples/optimization/advanced_strategies/agent/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/optimization/advanced_strategies/agent/agent.py b/examples/optimization/advanced_strategies/agent/agent.py new file mode 100644 index 0000000..90d8b8c --- /dev/null +++ b/examples/optimization/advanced_strategies/agent/agent.py @@ -0,0 +1,134 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""åœ°å€č§£ęž agent —— Advanced Strategies example 专用。 + +任劔设讔动机 +------------ +本 example ē”ØäŗŽéŖŒčÆ GEPA é«˜é˜¶ē­–ē•„ē»„åˆļ¼ˆuse_merge / frontier_type / +skip_perfect_score ē­‰ļ¼‰ēš„ēœŸå®žę•ˆęžœć€‚ä»»åŠ”åæ…é”»å­˜åœØäø¤äøŖ**äŗ’ē›øē‰µåˆ¶**ēš„ē»“åŗ¦ļ¼Œ +ę‰čƒ½é€¼å‡ŗē­–ē•„å·®å¼‚ļ¼š + +A. å®Œę•“åœ°å€ļ¼ˆcountry/city/postal_code/street éƒ½ē»™åˆ°ļ¼‰ā†’ ęœŸęœ›äø„ę ¼ JSON +B. 
ē¼ŗäæ”ęÆåœ°å€ļ¼ˆå°‘ postal_code ꈖ street)→ ęœŸęœ›åÆ¹åŗ”å­—ę®µč¾“å‡ŗ null + +候选 prompt å®¹ę˜“é™·å…„äø¤äøŖå±€éƒØęœ€ä¼˜ļ¼š +- 候选 P1 学会"äø„ę ¼ JSON"ä½†ę‰€ęœ‰å­—ę®µéƒ½äøē»™ nullļ¼ˆē¼ŗäæ”ęÆę—¶ē”¬ē¼–äø€äøŖļ¼‰ +- 候选 P2 学会"评 null å°± null"但 JSON ę ¼å¼å¶å°”å“© + +→ å¤šå­—ę®µåœŗę™Æäø‹ use_merge=true čƒ½čžåˆ P1/P2 å„č‡ŖęŽŒę”ēš„å­čƒ½åŠ›ć€‚ +→ frontier_type 选 instance vs objective åœØčæ™ē±»ä»»åŠ”äøŠč”Œäøŗå·®å¼‚ę˜¾č‘—ć€‚ + +ęŽ„å…„äøšåŠ”ę—¶ę”¹å“Ŗé‡Œ +---------------- +- ę›æę¢äøŗäøšåŠ”ä»»åŠ” agent äøŽ prompt +- äæē•™ _normalize_json 让 metric čµ° text exact,CI äøŠę›“ēØ³ +""" + +from __future__ import annotations + +import json +import re +import uuid +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import Part + +from .config import get_model_config + + +SYSTEM_PROMPT_PATH = Path(__file__).parent / "prompts" / "system.md" +APP_NAME = "advanced_strategies_demo" + +_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL) + + +def _create_model() -> LLMModel: + """ęž„å»ŗ OpenAI 兼容 chat ęØ”åž‹å®žä¾‹ć€‚""" + api_key, base_url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url) + + +def _read_instruction() -> str: + """ä»Žē£ē›˜é‡čÆ» system.md怂""" + return SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + + +def create_agent() -> LlmAgent: + """ęž„å»ŗäø€äøŖä½æē”Øå½“å‰ē£ē›˜ prompt ēš„ę–° LlmAgent å®žä¾‹ć€‚""" + return LlmAgent( + name="address_parser", + description="Parses free-text postal addresses into a strict JSON.", + model=_create_model(), + instruction=_read_instruction(), + generate_content_config=GenerateContentConfig( + temperature=0.1, + top_p=0.9, + 
max_output_tokens=256, + ), + ) + + +def _normalize_json(raw: str) -> str: + """把 LLM č¾“å‡ŗč§„čŒƒåŒ–ęˆēØ³å®š JSON 字符串。 + + äøŽ ci_integration / blackbox_cli å®Œå…Øē›øåŒēš„č§„čŒƒåŒ–é€»č¾‘ļ¼šč®© + final_response_avg_score(text.match=exact) ē›“ęŽ„čµ°ē²¾ē”®åŒ¹é…ć€‚ + """ + text = (raw or "").strip() + if not text: + return "" + match = _JSON_OBJECT_RE.search(text) + if not match: + return text + try: + parsed = json.loads(match.group(0)) + except json.JSONDecodeError: + return text + return json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":")) + + +async def call_agent(query: str) -> str: + """ę”†ęž¶å›žč°ƒļ¼šč·‘äø€ę¬”ęŽØē†ļ¼Œč¾“å‡ŗē» _normalize_json č§„čŒƒåŒ–ć€‚""" + root = create_agent() + session_service = InMemorySessionService() + runner = Runner( + app_name=APP_NAME, + agent=root, + session_service=session_service, + ) + session_id = str(uuid.uuid4()) + user_id = "parser" + await session_service.create_session( + app_name=APP_NAME, + user_id=user_id, + session_id=session_id, + state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=query)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, + session_id=session_id, + new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return _normalize_json(final_text) diff --git a/examples/optimization/advanced_strategies/agent/config.py b/examples/optimization/advanced_strategies/agent/config.py new file mode 100644 index 0000000..d0a64b1 --- /dev/null +++ b/examples/optimization/advanced_strategies/agent/config.py @@ -0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""ęØ”åž‹å‡­ę®čÆ»å– —— ä»ŽēŽÆå¢ƒå˜é‡åŠ č½½ OpenAI 兼容 LLM ēš„čæžęŽ„äæ”ęÆć€‚ + +éœ€č¦ēš„ēŽÆå¢ƒå˜é‡ +-------------- + TRPC_AGENT_API_KEY LLM åŽē«Æēš„ API key + TRPC_AGENT_BASE_URL LLM åŽē«Æēš„ endpoint + TRPC_AGENT_MODEL_NAME ęØ”åž‹å + +ē¼ŗä»»ę„äø€äøŖå°±ē«‹å³ęŠ› ValueErrorļ¼Œéæå…čæč”Œåˆ°äø€åŠę‰ę’žåˆ° LLM åŽē«Æēš„ 401 é”™čÆÆļ¼Œ +é‚£ę—¶ęŠ„é”™äæ”ęÆä¼šå¾ˆęœ‰čæ·ęƒ‘ę€§ļ¼ˆēœ‹čµ·ę„åƒ prompt å†™é”™äŗ†ļ¼Œå®žé™…ę˜Æå‡­ę®ę²”é…ļ¼‰ć€‚ +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """čæ”å›ž (api_key, base_url, model_name)ļ¼›ä»»äø€ē¼ŗå¤±ē«‹åˆ»ęŠ„é”™ć€‚""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "čæč”Œä¼˜åŒ–å™Øå‰åæ…é”»é…ē½®ēŽÆå¢ƒå˜é‡ TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME怂" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/advanced_strategies/agent/prompts/system.md b/examples/optimization/advanced_strategies/agent/prompts/system.md new file mode 100644 index 0000000..f9b299b --- /dev/null +++ b/examples/optimization/advanced_strategies/agent/prompts/system.md @@ -0,0 +1 @@ +You parse free-text postal addresses and return a JSON object. diff --git a/examples/optimization/advanced_strategies/compare.py b/examples/optimization/advanced_strategies/compare.py new file mode 100644 index 0000000..164b27b --- /dev/null +++ b/examples/optimization/advanced_strategies/compare.py @@ -0,0 +1,100 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""č·‘å®Œ baseline + advanced åŽē”Øęœ¬č„šęœ¬åÆ¹ęÆ” result.jsonļ¼Œč¾“å‡ŗåÆ¹ęÆ”č”Øć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +é«˜é˜¶ē­–ē•„ A/B åÆ¹ē…§å®žéŖŒēš„åˆ†ęžē«Æć€‚å…ˆč·‘ run_baseline.py + run_advanced.py, +å†č·‘ęœ¬č„šęœ¬ļ¼šč‡ŖåŠØé€‰å– runs/ äø‹ęœ€ę–°ēš„ baseline_* äøŽ advanced_* ē›®å½•č§£ęž +result.jsonļ¼ŒęŒ‰å¤šē»“åŗ¦åÆ¹ęÆ”č”Øč¾“å‡ŗć€‚ + +result.json å­—ę®µå‘½åę³Øę„ +------------------------ +SDK å†…éƒØ snake_caseļ¼Œä½†åŗåˆ—åŒ–åˆ° result.json ę—¶é€ščæ‡ alias_generator č½¬ę¢äøŗ +camelCaseć€‚ęœ¬č„šęœ¬ęŒ‰ camelCase ē“¢å¼•ļ¼ˆstopReason / totalRounds / bestPassRate +ē­‰ļ¼‰ć€‚č‡Ŗęœ‰č„šęœ¬čÆ» result.json ę—¶åŒę ·ęŒ‰ę­¤ēŗ¦å®šć€‚ +""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +RUNS_DIR = _HERE / "runs" + + +def _latest(prefix: str) -> Path | None: + """ęŒ‘å‡ŗ runs/_* äø­ęœ€ę–°äø€ę¬”ēš„ē›®å½•ć€‚""" + candidates = sorted(RUNS_DIR.glob(f"{prefix}_*")) + return candidates[-1] if candidates else None + + +def _load(run_dir: Path) -> dict: + """读 result.jsonļ¼›ē¼ŗå¤±čæ”å›žē©ŗ dict怂""" + rj = run_dir / "result.json" + if not rj.exists(): + return {} + return json.loads(rj.read_text(encoding="utf-8")) + + +def _short(d: dict) -> dict: + """ä»Žå®Œę•“ result.json äø­ęŠ½å–ęœ¬ę¬”åÆ¹ęÆ”å…³ę³Øēš„ē»“åŗ¦ć€‚ + + ē»“åŗ¦é€‰ę‹©åŽŸåˆ™ļ¼ščƒ½ē›“ęŽ„åę˜ é«˜é˜¶ē­–ē•„č”Œäøŗå·®å¼‚ēš„å­—ę®µļ¼ˆč½®ę¬”ęŽ„å—ēŽ‡ć€merge + č§¦å‘ę¬”ę•°ć€reflection LM č°ƒē”Øę•°ē­‰ļ¼‰ļ¼Œč€Œéžä»…ęœ€ē»ˆ best_pass_rate + ļ¼ˆé«˜é˜¶ē­–ē•„å¾€å¾€äøŽ baseline ę”¶ę•›åˆ°ē›øčæ‘ęœ€ē»ˆåˆ†ę•°ļ¼Œå·®å¼‚åœØåˆ°č¾¾č·Æå¾„äøŠļ¼‰ć€‚ + """ + rounds = d.get("rounds") or [] + accepted = sum(1 for r in rounds if r.get("accepted")) + merge_total = sum(1 for r in rounds if r.get("kind") == "merge") + merge_accepted = sum(1 for r in rounds if r.get("kind") == "merge" and r.get("accepted")) + return { + "stop_reason": d.get("stopReason"), + "finish_reason": d.get("finishReason"), + "duration_s": round(d.get("durationSeconds") or 0.0, 1), + "total_rounds": 
d.get("totalRounds"), + "rounds_accepted": accepted, + "merge_rounds_total": merge_total, + "merge_rounds_accepted": merge_accepted, + "reflection_lm_calls": d.get("totalReflectionLmCalls"), + "baseline_pass_rate": d.get("baselinePassRate"), + "best_pass_rate": d.get("bestPassRate"), + } + + +def main() -> int: + """čÆ»äø¤ę¬”ęœ€ę–° runļ¼Œč¾“å‡ŗåÆ¹ęÆ”č”Øć€‚""" + base = _latest("baseline") + adv = _latest("advanced") + if base is None or adv is None: + print( + "Need both baseline_* and advanced_* runs in runs/. " + "Run run_baseline.py and run_advanced.py first.", + file=sys.stderr, + ) + return 1 + + print(f"baseline run : {base.name}") + print(f"advanced run : {adv.name}\n") + + a = _short(_load(base)) + b = _short(_load(adv)) + keys = list(a.keys()) + width = max(len(k) for k in keys) + 2 + print(f"{'metric'.ljust(width)}{'baseline'.rjust(18)}{'advanced'.rjust(18)}") + print("-" * (width + 36)) + for k in keys: + va = a.get(k) + vb = b.get(k) + print(f"{k.ljust(width)}{str(va).rjust(18)}{str(vb).rjust(18)}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/optimization/advanced_strategies/data/train.evalset.json b/examples/optimization/advanced_strategies/data/train.evalset.json new file mode 100644 index 0000000..d9c85f9 --- /dev/null +++ b/examples/optimization/advanced_strategies/data/train.evalset.json @@ -0,0 +1,73 @@ +{ + "eval_set_id": "advanced_strategies_train", + "name": "Address parsing - train", + "description": "6 case 混合:3 äøŖå®Œę•“åœ°å€ļ¼ˆcountry/city/postal_code/street å…Øęœ‰ļ¼‰+ 3 äøŖē¼ŗäæ”ęÆåœ°å€ļ¼ˆéƒØåˆ†å­—ę®µéœ€č¦č¾“å‡ŗ null)。GEPA č¦åŒę—¶å­¦å„½čæ™äø¤ē±»ļ¼Œę˜Æ use_merge č§¦å‘åœŗę™Æć€‚", + "eval_cases": [ + { + "eval_id": "train_full_us", + "conversation": [ + { + "invocation_id": "t1", + "user_content": {"parts": [{"text": "1600 Amphitheatre Parkway, Mountain View, CA 94043, USA"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Mountain 
View\",\"country\":\"USA\",\"postal_code\":\"94043\",\"street\":\"1600 Amphitheatre Parkway\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_full_uk", + "conversation": [ + { + "invocation_id": "t2", + "user_content": {"parts": [{"text": "10 Downing Street, London SW1A 2AA, UK"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"London\",\"country\":\"UK\",\"postal_code\":\"SW1A 2AA\",\"street\":\"10 Downing Street\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_full_jp", + "conversation": [ + { + "invocation_id": "t3", + "user_content": {"parts": [{"text": "1-1 Chiyoda, Chiyoda City, Tokyo 100-8111, Japan"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Chiyoda City\",\"country\":\"Japan\",\"postal_code\":\"100-8111\",\"street\":\"1-1 Chiyoda\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_missing_postal", + "conversation": [ + { + "invocation_id": "t4", + "user_content": {"parts": [{"text": "Avenue des Champs-ƉlysĆ©es, Paris, France"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Paris\",\"country\":\"France\",\"postal_code\":null,\"street\":\"Avenue des Champs-ƉlysĆ©es\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_missing_street", + "conversation": [ + { + "invocation_id": "t5", + "user_content": {"parts": [{"text": "Berlin 10115, Germany"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Berlin\",\"country\":\"Germany\",\"postal_code\":\"10115\",\"street\":null}"}], "role": "model"} + } + ], + "session_input": {"app_name": 
"advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "train_missing_both", + "conversation": [ + { + "invocation_id": "t6", + "user_content": {"parts": [{"text": "Sydney, Australia"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Sydney\",\"country\":\"Australia\",\"postal_code\":null,\"street\":null}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + } + ] +} diff --git a/examples/optimization/advanced_strategies/data/val.evalset.json b/examples/optimization/advanced_strategies/data/val.evalset.json new file mode 100644 index 0000000..b6c09c7 --- /dev/null +++ b/examples/optimization/advanced_strategies/data/val.evalset.json @@ -0,0 +1,73 @@ +{ + "eval_set_id": "advanced_strategies_val", + "name": "Address parsing - val", + "description": "6 case éŖŒčÆé›†ļ¼ˆ>=5 滔足 merge_val_overlap_floor é»˜č®¤ļ¼‰ļ¼š3 å®Œę•“ + 3 缺俔息。schema äøŽ train å®Œå…Øäø€č‡“ć€‚", + "eval_cases": [ + { + "eval_id": "val_full_ca", + "conversation": [ + { + "invocation_id": "v1", + "user_content": {"parts": [{"text": "100 Queen Street West, Toronto, ON M5H 2N2, Canada"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Toronto\",\"country\":\"Canada\",\"postal_code\":\"M5H 2N2\",\"street\":\"100 Queen Street West\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_full_de", + "conversation": [ + { + "invocation_id": "v2", + "user_content": {"parts": [{"text": "Marienplatz 1, Munich 80331, Germany"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Munich\",\"country\":\"Germany\",\"postal_code\":\"80331\",\"street\":\"Marienplatz 1\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_full_sg", + "conversation": [ + { + 
"invocation_id": "v3", + "user_content": {"parts": [{"text": "1 Marina Boulevard, Singapore 018989, Singapore"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Singapore\",\"country\":\"Singapore\",\"postal_code\":\"018989\",\"street\":\"1 Marina Boulevard\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_missing_postal", + "conversation": [ + { + "invocation_id": "v4", + "user_content": {"parts": [{"text": "Plaza Mayor, Madrid, Spain"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Madrid\",\"country\":\"Spain\",\"postal_code\":null,\"street\":\"Plaza Mayor\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_missing_street", + "conversation": [ + { + "invocation_id": "v5", + "user_content": {"parts": [{"text": "Rome 00184, Italy"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Rome\",\"country\":\"Italy\",\"postal_code\":\"00184\",\"street\":null}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + }, + { + "eval_id": "val_missing_both", + "conversation": [ + { + "invocation_id": "v6", + "user_content": {"parts": [{"text": "Cairo, Egypt"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"Cairo\",\"country\":\"Egypt\",\"postal_code\":null,\"street\":null}"}], "role": "model"} + } + ], + "session_input": {"app_name": "advanced_strategies_demo", "user_id": "parser", "state": {}} + } + ] +} diff --git a/examples/optimization/advanced_strategies/optimizer_advanced.json b/examples/optimization/advanced_strategies/optimizer_advanced.json new file mode 100644 index 0000000..4081e59 --- /dev/null +++ b/examples/optimization/advanced_strategies/optimizer_advanced.json @@ -0,0 +1,48 @@ +{ + "evaluate": { + 
"metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "exact", + "case_insensitive": false + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "objective", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 2, + "skip_perfect_score": true, + "perfect_score": 1.0, + "use_merge": true, + "max_merge_invocations": 3, + "merge_val_overlap_floor": 5, + "max_metric_calls": 60, + "score_threshold": 1.0, + "max_iterations_without_improvement": 4 + } + } +} diff --git a/examples/optimization/advanced_strategies/optimizer_baseline.json b/examples/optimization/advanced_strategies/optimizer_baseline.json new file mode 100644 index 0000000..e757186 --- /dev/null +++ b/examples/optimization/advanced_strategies/optimizer_baseline.json @@ -0,0 +1,45 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "exact", + "case_insensitive": false + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + 
"frontier_type": "instance", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 2, + "skip_perfect_score": false, + "use_merge": false, + "max_metric_calls": 60, + "score_threshold": 1.0, + "max_iterations_without_improvement": 4 + } + } +} diff --git a/examples/optimization/advanced_strategies/run_advanced.py b/examples/optimization/advanced_strategies/run_advanced.py new file mode 100644 index 0000000..da32d19 --- /dev/null +++ b/examples/optimization/advanced_strategies/run_advanced.py @@ -0,0 +1,70 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""é…ē½® Bļ¼šé«˜é˜¶ē­–ē•„ē»„åˆ —— frontier_type=objective + skip_perfect_score=true + +use_merge=true怂 + +é€‚ē”Øåœŗę™Æ +-------- +é«˜é˜¶ē­–ē•„ A/B åÆ¹ē…§å®žéŖŒēš„"高阶"čæč”Œć€‚äøŽ run_baseline.py å…±ē”ØåŒäø€ä»½ę•°ę®é›† +和 agentļ¼Œä»… optimizer JSON äøåŒļ¼Œä¾æäŗŽåœØå…¬å¹³ę”ä»¶äø‹č§‚åÆŸē­–ē•„å·®å¼‚ć€‚ + +é¢„ęœŸäøŽ baseline ēš„å·®å¼‚ +---------------------- +- åę€ LM č°ƒē”Øę›“ēœļ¼ˆę»”åˆ† case äøå†å–‚å›žåę€ minibatch) +- objective frontier ęŽ„å—é—Øę§›ę›“ä½Žļ¼Œrounds_accepted ę›“å¤šä½† valset ę˜“éœ‡č” +- å•å­—ę®µä¼˜åŒ–äø‹ use_merge=true äøä¼šēœŸč§¦å‘ merge(gepa 是 predictor-level + mergeļ¼Œéœ€č¦č‡³å°‘ 2 äøŖå­—ę®µę‰ęœ‰ę„ä¹‰ļ¼›čÆ¦č§ README §6.1) + +č¾“å‡ŗč½åˆ° runs/advanced_<ę—¶é—“ęˆ³>/,compare.py č‡ŖåŠØé€‰å–ęœ€ę–°äø€ę¬”åÆ¹ęÆ”ć€‚ +""" + +from __future__ import annotations + +import asyncio +import sys +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt # noqa: E402 + +from agent.agent import SYSTEM_PROMPT_PATH, call_agent # noqa: E402 + + 
+CONFIG_PATH = _HERE / "optimizer_advanced.json" +TRAIN_PATH = _HERE / "data" / "train.evalset.json" +VAL_PATH = _HERE / "data" / "val.evalset.json" +RUNS_DIR = _HERE / "runs" + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimizeļ¼ˆē”Ø advanced é…ē½®ļ¼‰ć€‚""" + target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / f"advanced_{timestamp}" + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + update_source=False, + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/advanced_strategies/run_baseline.py b/examples/optimization/advanced_strategies/run_baseline.py new file mode 100644 index 0000000..39765a6 --- /dev/null +++ b/examples/optimization/advanced_strategies/run_baseline.py @@ -0,0 +1,65 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""é…ē½® A:basic ē­–ē•„ē»„åˆ —— äøŽ quickstart å‡ ä¹Žäø€č‡“ļ¼Œä½œäøŗåÆ¹ē…§åŸŗēŗæć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +é«˜é˜¶ē­–ē•„ A/B åÆ¹ē…§å®žéŖŒēš„åŸŗēŗæčæč”Œć€‚é…åˆ run_advanced.py + compare.py ä½æē”Øļ¼š +- ęœ¬č„šęœ¬ļ¼šbasic ē­–ē•„ē»„åˆļ¼ˆpareto + instance + use_merge=false + + skip_perfect_score=false) +- run_advanced.pyļ¼šé«˜é˜¶ē­–ē•„ē»„åˆ +- compare.pyļ¼šč§£ęžäø¤ę¬” result.json 输出对比蔨 + +č¾“å‡ŗč½åˆ° runs/baseline_<ę—¶é—“ęˆ³>/,compare.py č‡ŖåŠØé€‰å–ęœ€ę–°äø€ę¬”åÆ¹ęÆ”ć€‚ +""" + +from __future__ import annotations + +import asyncio +import sys +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt # noqa: E402 + +from agent.agent import SYSTEM_PROMPT_PATH, call_agent # noqa: E402 + + +CONFIG_PATH = _HERE / "optimizer_baseline.json" +TRAIN_PATH = _HERE / "data" / "train.evalset.json" +VAL_PATH = _HERE / "data" / "val.evalset.json" +RUNS_DIR = _HERE / "runs" + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimizeļ¼ˆē”Ø baseline é…ē½®ļ¼‰ć€‚""" + target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / f"baseline_{timestamp}" + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + update_source=False, + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/blackbox_cli/README.md b/examples/optimization/blackbox_cli/README.md new file mode 100644 index 0000000..0b54214 --- /dev/null +++ b/examples/optimization/blackbox_cli/README.md @@ 
-0,0 +1,205 @@ +# Blackbox CLI — ä¼˜åŒ–å¤–éƒØé»‘ē›’ CLI ēš„ prompt ꖇ件 + +> **é€‚ē”Øåœŗę™Æ**:业劔 agent äøę˜Æęœ¬ę”†ęž¶ēš„ `LlmAgent`ļ¼Œč€Œę˜Æäø€äøŖå¤–éƒØå‘½ä»¤č”Œå·„å…·ļ¼ˆå¦‚ `claude` / `codex` / 自研 CLIļ¼‰ļ¼Œå…¶č”Œäøŗē”±č‹„å¹² prompt ę–‡ä»¶ļ¼ˆå¦‚ `CLAUDE.md` / `SKILL.md`ļ¼‰ęŽ§åˆ¶ć€‚ęœ¬ example ę¼”ē¤ŗé€ščæ‡ `subprocess` 把 CLI å½“ä½œå®Œå…Øé»‘ē›’ēš„ agent,让 GEPA ä¼˜åŒ–å®ƒčÆ»å–ēš„ prompt ę–‡ä»¶ļ¼Œę•“äøŖčæ‡ēØ‹äøäæ®ę”¹ CLI ä»£ē ć€äøē»‘å®šå…¶å†…éƒØ LLM clientć€‚é˜…čÆ»å‰čÆ·å…ˆē†Ÿę‚‰ `quickstart/README.md` §2怂 + +## 1 Ā· é€‚ē”Øé—®é¢˜äøŽč®¾č®”ē›®ę ‡ + +å¤–éƒØ CLI å·„å…·ēš„ prompt å·„ēØ‹ē‰¹ē‚¹ļ¼š + +- å·„å…·å®žēŽ°ē»†čŠ‚ļ¼ˆčÆ­čØ€ć€čæč”Œę—¶ć€å†…éƒØ LLM clientļ¼‰åÆ¹ä¼˜åŒ–å™Øå®Œå…Øé»‘ē›’ +- prompt é€ščæ‡ē‰¹å®šę–‡ä»¶å / ē›®å½•ē»“ęž„ēŗ¦å®šåŠ č½½ļ¼ˆå…øåž‹å¦‚ `CLAUDE.md` + `.claude/skills//SKILL.md`) +- CLI åÆåŠØę—¶ę˜Æē‹¬ē«‹čæ›ēØ‹ļ¼ŒäøŽä¼˜åŒ–å™Øčæ›ēØ‹é€ščæ‡ stdin / stdout / ę–‡ä»¶ē³»ē»Ÿé€šäæ” + +`AgentOptimizer` åœØę­¤åœŗę™Æäø‹ę‰®ę¼”ēŗÆå®¢ęˆ·ē«Æč§’č‰²ļ¼šé€ščæ‡ `subprocess` 启动 CLI čæ›ēØ‹ć€ęŠŠęµ‹čÆ• query ä½œäøŗå‚ę•°ä¼ å…„ć€ę”¶é›† stdout态ꌉ metric čÆ„åˆ†ć€‚ä¼˜åŒ–å™ØäøŽ CLI čæ›ēØ‹é—“ēš„å”Æäø€č€¦åˆē‚¹ę˜Æ **CLI čÆ»å–ēš„ prompt ꖇ件**ā€”ā€”ä¼˜åŒ–å™Øå†™å…„ę–°å€™é€‰ļ¼ŒCLI åœØäø‹äø€ę¬”åÆåŠØę—¶č‡ŖåŠØčÆ»å–ę–°å†…å®¹ć€‚ + +| 输兄 | 输出 | +| --- | --- | +| äø€äøŖę”ÆęŒ"åÆåŠØę—¶čÆ» prompt ꖇ件"ēš„å¤–éƒØ CLI å·„å…· | 滔足 metric é˜ˆå€¼ēš„ęœ€ä¼˜ prompt 候选 | +| CLI ęŽ„å— query ä½œäøŗå‚ę•° / stdin ēš„åč®® | CLI äŗŒčæ›åˆ¶å®Œå…Øäøå˜ļ¼Œä»…ē£ē›˜äøŠ prompt 文件被改写 | + +### 本 example ę¼”ē¤ŗēš„ęœ€å°ē”Øä¾‹ + +| 结度 | 值 | +| --- | --- | +| 业劔任劔 | äø­å›½åŸŽåø‚äæ”ęÆęŸ„čÆ¢ļ¼ˆč¾“å…„åŸŽåø‚åļ¼Œč¾“å‡ŗäø„ę ¼ JSON `{city, country, is_capital}`) | +| 黑盒 CLI | `trpc-claudecode`ļ¼ˆč…¾č®Æå†…éƒØ Claude Code é€‚é…å±‚ļ¼ŒOpenAI å…¼å®¹åč®®ęŒ‡å‘ GLM-5.1) | +| ä¼˜åŒ–ē›®ę ‡ | `workspace/CLAUDE.md` + `workspace/.claude/skills/city-info/SKILL.md` 共两个文件 | +| éŖŒčÆęŒ‡ę ‡ | `final_response_avg_score`(exact 匹配 stdout č§„čŒƒåŒ–åŽēš„ JSON) | +| 训练 / éŖŒčÆč§„ęØ” | 5 ę” / 3 ę” | + +## 2 Ā· ęœÆčÆ­åÆ¹ē…§ + 
+ä»…åˆ—å‡ŗęœ¬ example å¼•å…„ēš„ę–°ę¦‚åæµć€‚åŸŗē”€ęœÆčÆ­č§ `quickstart/README.md` §2怂 + +| 术语 | 含义 | +| --- | --- | +| **subprocess č°ƒē”Ø** | 用 `asyncio.create_subprocess_exec` åÆåŠØå­čæ›ēØ‹ļ¼Œä¼  query 作 argv,读 stdoutć€‚å­čæ›ēØ‹ē‹¬ē«‹čæ›ēØ‹ļ¼ŒäøŽä¼˜åŒ–å™Øčæ›ēØ‹ę— ä»»ä½•čµ„ęŗå…±äŗ«ć€‚ | +| **CLI å·„ä½œē›®å½•ļ¼ˆworkspace)** | CLI åÆåŠØę—¶é€ščæ‡ `--add-dir ` ęŒ‡å®šēš„ē›®å½•ļ¼ŒCLI č‡ŖåŠØä»Žäø­åŠ č½½ prompt ę–‡ä»¶ć€‚ęœ¬ example äø­å³ `workspace/`怂 | +| **stdout č§„čŒƒåŒ–** | 用 `json.loads + json.dumps(sort_keys=True, ensure_ascii=False, separators=(",", ":"))` 把 LLM č‡Ŗē”±ę–‡ęœ¬č¾“å‡ŗč½¬ę¢äøŗå”Æäø€å­—ē¬¦äø²å½¢ę€ļ¼Œä½æ metric ē›“ęŽ„čµ°ę–‡ęœ¬ē²¾ē”®åŒ¹é…ļ¼Œę— éœ€ LLM judge怂 | +| **ēŽÆå¢ƒå˜é‡ę˜ å°„** | ęŠŠé€šē”Øēš„ `TRPC_AGENT_*` äø‰ä»¶å„—ę˜ å°„ęˆ CLI ęœŸęœ›ēš„ `TRPC_CLAUDECODE_*` äø‰ä»¶å„—ļ¼Œéæå…ē”Øęˆ·äøŗ CLI å•ē‹¬é…ē½® OAuth ꈖ API key怂 | + +## 3 Ā· čæč”Œē¤ŗä¾‹ + +### 3.1 ä¾čµ–ę£€ęŸ„ + +```bash +which trpc-claudecode # åŗ”č¾“å‡ŗåÆę‰§č”Œč·Æå¾„ +trpc-claudecode --version # éŖŒčÆåÆę­£åøøåÆåŠØ +``` + +CLI äŗŒčæ›åˆ¶äøŗå¤–éƒØä¾čµ–ļ¼Œęœ¬ example äøé€ščæ‡ pip å®‰č£…ć€‚å…¶ä»–č‡Ŗęœ‰ CLI ę›æę¢ `CLI_BINARY` åøøé‡å³åÆć€‚ + +### 3.2 安装 SDK åÆé€‰ä¾čµ– + +```bash +pip install -e ".[optimize]" +``` + +### 3.3 é…ē½®ēŽÆå¢ƒå˜é‡ + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +`call_agent` å†…éƒØä¼šč‡ŖåŠØęŠŠčæ™äø‰äøŖå˜é‡ę˜ å°„ęˆ `TRPC_CLAUDECODE_BASE_URL` / `TRPC_CLAUDECODE_API_KEY` / `TRPC_CLAUDECODE_MODEL`ļ¼Œå¹¶é™„åŠ  GLM-5.1 ęŽØčēš„ `CLAUDE_CODE_AUTO_COMPACT_WINDOW=165000` / `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE=85`怂 + +**ꗠ需 `trpc-claudecode auth login`ļ¼Œę— éœ€ `ANTHROPIC_API_KEY`**怂 + +### 3.4 启动 + +```bash +python examples/optimization/blackbox_cli/run_optimization.py +``` + +### 3.5 äŗ§ē‰©ē»“ęž„ + +``` +runs// +ā”œā”€ā”€ result.json +ā”œā”€ā”€ summary.txt +ā”œā”€ā”€ baseline_prompts/ čæč”Œå‰ēš„ CLAUDE.md / SKILL.md 快照 +ā”œā”€ā”€ best_prompts/ val é›†å¾—åˆ†ęœ€é«˜ēš„å€™é€‰ +└── rounds/ + 
+workspace/ CLI å·„ä½œē›®å½•ļ¼ˆupdate_source=False ę—¶č‡ŖåŠØå›žę»šåˆ° baseline) +ā”œā”€ā”€ CLAUDE.md +└── .claude/skills/city-info/SKILL.md +``` + +## 4 Ā· ęž¶ęž„äøŽę•°ę®ęµ + +``` +[run_optimization.py] + │ + ā”œā”€ā”€ TargetPrompt + │ .add_path("claude_md", workspace/CLAUDE.md) + │ .add_path("skill_md", workspace/.claude/skills/city-info/SKILL.md) + │ │ GEPA ęÆč½®ęŠŠå€™é€‰å†™å…„åÆ¹åŗ”ę–‡ä»¶ + │ ā–¼ + │ workspace/{CLAUDE.md, .claude/skills/city-info/SKILL.md} + │ │ CLI åÆåŠØę—¶é€ščæ‡ --add-dir č‡ŖåŠØåŠ č½½ + │ ā–¼ + └── call_agent(query): + ā”œā”€ā”€ _build_cli_env() ę˜ å°„ env 三件儗 + ā”œā”€ā”€ asyncio.create_subprocess_exec( + │ "trpc-claudecode", "--print", + │ "--add-dir", workspace/, + │ "--dangerously-skip-permissions", + │ query, + │ ) + ā”œā”€ā”€ proc.communicate(timeout=90s) + └── _normalize_response(stdout) 瓧凑 JSON 字符串 +``` + +### 4.1 ę–‡ä»¶ęø…å• + +| ꖇ件 | 角色 | ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ēš„äæ®ę”¹ę–¹å‘ | +| --- | --- | --- | +| `run_optimization.py` | ä¼˜åŒ–å™Øå…„å£ļ¼Œę³Øå†Œ `TargetPrompt` 两个文件 | č°ƒę•“ `CLAUDE_MD_PATH` / `SKILL_MD_PATH` č‡³č‡Ŗęœ‰ CLI ęœŸęœ›ēš„ę–‡ä»¶č·Æå¾„ | +| `agent/call_agent.py` | subprocess č°ƒē”Ø + env ę˜ å°„ + stdout č§„čŒƒåŒ– | **ę øåæƒę”¹é€ ē‚¹**ļ¼šę›æę¢ `CLI_BINARY` / å‘½ä»¤č”Œå‚ę•° / env ę˜ å°„č§„åˆ™ | +| `workspace/CLAUDE.md` | CLI åÆåŠØę—¶čÆ»å–ēš„äø» prompt(GEPA 写兄目标) | ę›æę¢äøŗäøšåŠ” baseline 起点 | +| `workspace/.claude/skills/city-info/SKILL.md` | CLI åÆåŠØę—¶čÆ»å–ēš„ skill ęčæ°ļ¼ˆGEPA 写兄目标) | å•ę–‡ä»¶ä¼˜åŒ–ę—¶ę•“ä½“åˆ é™¤å¹¶åŽ»ęŽ‰ `add_path("skill_md", ...)` | +| `optimizer.json` | 算法 + metric é…ē½® | č°ƒę•“é˜ˆå€¼ / åœę­¢ę”ä»¶ | +| `train.evalset.json` / `val.evalset.json` | ę•°ę®é›† | ę›æę¢äøŗäøšåŠ”ē”Øä¾‹ļ¼ˆreference å­—ę®µéœ€ē»čæ‡ `_normalize_response` åŒē­‰å¤„ē†ļ¼‰ | + +## 5 Ā· å…³é”®é…ē½® + +### 5.1 ęŽØčå‚ę•°å–å€¼ + +```jsonc +{ + "optimize": { + "eval_case_parallelism": 1, // 黑盒 CLI äø²č”Œęœ€ēØ³ļ¼›å¹¶å‘åÆčƒ½čø© CLI čæ›ēØ‹å¹¶å‘é—®é¢˜ + "algorithm": { + "module_selector": "round_robin", + 
"frontier_type": "instance", // CLI ę…¢/蓵,instance ę”¶ę•›åæ«äøęµŖč“¹č°ƒē”Ø + "use_merge": false, // éæå… metric_calls 浪蓹在 merge 上 + "reflection_minibatch_size": 3, + "max_metric_calls": 24, // CLI 一欔约 10s,24 欔约 4 分钟 + "score_threshold": 1.0 + } + } +} +``` + +| 字段 | 选择理由 | +| --- | --- | +| `eval_case_parallelism=1` | CLI å­čæ›ēØ‹å¹¶å‘å­˜åœØäøē”®å®šę€§ļ¼ˆå…±äŗ«ę–‡ä»¶é”ć€stdout ē¼“å†²ć€å­čæ›ēØ‹ę•°äøŠé™ļ¼‰ļ¼Œäø²č”Œęœ€ēØ³ | +| `frontier_type=instance` | CLI č°ƒē”Øę…¢äø”č“µļ¼Œinstance å‰ę²æåœØå°č§„ęØ”čÆ„ä¼°äø‹ę”¶ę•›ę›“åæ« | +| `use_merge=false` | merge éœ€č¦é¢å¤– metric calls;黑盒 CLI åœŗę™Æäø‹åŗ”é›†äø­é¢„ē®—åœØåę€äøŠ | +| `score_threshold=1.0` | é»‘ē›’ē»“ęž„åŒ–č¾“å‡ŗēš„ē›®ę ‡ę˜Æå®Œē¾ŽåŒ¹é… | + +### 5.2 CLI 子进程超时 + +`agent/call_agent.py` äø­ `CLI_TIMEOUT_SEC=90.0`ļ¼šå•ę¬” CLI č°ƒē”Øč¶…čæ‡ 90 ē§’č¢«å¼ŗåˆ¶ kill å¹¶ęŠ› `RuntimeError`ļ¼Œéæå…ęŸę¬” CLI å”ę­»ę‹–åž®ę•“č½®čÆ„ä¼°ć€‚äøšåŠ” CLI å¹³å‡č€—ę—¶äøåŒéœ€ē›øåŗ”č°ƒę•“ć€‚ + +## 6 Ā· 设讔要点 + +### 6.1 äøŗä»€ä¹ˆäøē”Ø `--system-prompt` 注兄 prompt + +CLI é€šåøøę”ÆęŒ `--system-prompt ""` äø€ę¬”ę€§ę³Øå…„å­—ē¬¦äø²ć€‚ä½†ęœ¬ example 使用 `--add-dir ` 让 CLI č‡Ŗå·±ä»Žē›®å½•åŠ č½½ prompt ę–‡ä»¶ļ¼ŒåŽŸå› ļ¼š + +- **ę”ÆęŒå¤šę–‡ä»¶ä¼˜åŒ–**:`CLAUDE.md` + `SKILL.md` 是 CLI ēŗ¦å®šē»“ęž„ļ¼Œå¤šę–‡ä»¶ä½œäøŗē‹¬ē«‹ `TargetPrompt` å­—ę®µę‰čƒ½č®© GEPA 选择性改写其中之一 +- **äøŽ CLI åŽŸē”Ÿå·„ä½œęµåÆ¹é½**ļ¼šäøšåŠ”ēœŸå®žä½æē”Ø CLI ę—¶ä¹Ÿę˜ÆęŠŠ prompt å†™åˆ°å·„ä½œē›®å½•ć€CLI č‡ŖåŠØå‘ēŽ°ļ¼Œęœ¬ example č·Æå¾„äøŽä¹‹äø€č‡“ + +### 6.2 äøŗä»€ä¹ˆ stdout 要做 `_normalize_response` + +LLM č¾“å‡ŗåøøåø¦å°¾éƒØē©ŗę ¼ć€JSON å‰åŽå¤šåå­—ē¬¦ē­‰å™ŖéŸ³ć€‚`_normalize_response`: + +1. ē”Øę­£åˆ™å®šä½é¦–äøŖ `{...}` 块 +2. 
`json.loads` + `json.dumps(sort_keys=True, ensure_ascii=False, separators=(",", ":"))` ę¶ˆé™¤ē©ŗę ¼ / key é”ŗåŗå·®å¼‚ + +→ baseline äøŽå€™é€‰ prompt ēš„č¾“å‡ŗåÆ¹é½åˆ°å”Æäø€å­—ē¬¦äø²å½¢ę€ļ¼ŒåÆē›“ęŽ„čµ° `final_response_avg_score(text.match=exact)`,**čÆ„ęµ‹å±‚å®Œå…Øäøéœ€č¦ LLM judge**,CI äøŠåæ«ć€ēØ³ć€åÆé‡å¤ć€‚ + +### 6.3 subprocess äøŽ async 资源 + +å­čæ›ēØ‹ę˜Æē‹¬ē«‹ OS čæ›ēØ‹ļ¼ŒäøäøŽä¼˜åŒ–å™Øčæ›ēØ‹å…±äŗ« async čµ„ęŗļ¼ˆäŗ‹ä»¶å¾ŖēŽÆć€čæžęŽ„ę± ē­‰ļ¼‰ļ¼Œę˜Æé»‘ē›’ CLI ęØ”å¼ēš„éšę€§ä¼˜ē‚¹ļ¼šäøšåŠ” CLI ēš„å†…éƒØå¹¶å‘ęØ”åž‹åÆ¹ SDK å®Œå…ØäøåÆč§ä¹Ÿę— éœ€åÆ¹é½ć€‚ + +## 7 Ā· åøøč§é—®é¢˜ + +**Q:CLI åÆåŠØę…¢ļ¼ˆęÆę¬”å‡ ē§’ļ¼‰ļ¼Œę€Žä¹ˆåŠžļ¼Ÿ** +Aļ¼šå°½é‡č°ƒå° `max_metric_calls`ć€č°ƒå¤§ `reflection_minibatch_size`ļ¼ˆäø€ę¬”åę€ēœ‹ę›“å¤š case ä½†å°‘č·‘å‡ č½®ļ¼‰ć€‚å½»åŗ•ę”¹é€ éœ€å°† CLI ę”¹é€ äøŗåøøé©»ęœåŠ”ļ¼Œå‚č€ƒ `http_service/` example怂 + +**Q:CLI č¾“å‡ŗäøę˜Æ JSON ę€Žä¹ˆåŠžļ¼Ÿ** +Aļ¼šę ¹ę®äøšåŠ” metric ē±»åž‹é€‰ę‹©äøåŒč§„čŒƒåŒ–ē­–ē•„ć€‚č‹„ metric 是 `final_response_avg_score(text.match=contains)`ļ¼ŒåÆē›“ęŽ„ strip stdoutļ¼›č‹„éœ€č¦äø„ę ¼åŒ¹é…ļ¼ŒęŒ‰äøšåŠ”č¾“å‡ŗå½¢ę€ę”¹å†™ `_normalize_response`怂 + +**Q:CLI čæ›ēØ‹ę„å¤–é€€å‡ŗļ¼ˆreturncode != 0ļ¼‰ä¼šę€Žę ·ļ¼Ÿ** +A:`_run_cli` ä¼šęŠ› `RuntimeError` 携带 stderr 前 400 å­—ē¬¦ļ¼Œå¼‚åøøä¼ ę’­åˆ°ä¼˜åŒ–å™Øļ¼ŒåÆ¼č‡“å½“å‰ case čÆ„ęµ‹å¤±č“„ć€å½“å‰å€™é€‰åÆčƒ½č¢«ę‹’ē»ć€‚ + +**Q:`workspace/` åœØč¢«ä¼˜åŒ–ęœŸé—“ä¼šäøä¼šč¢«å¤šäøŖ CLI čæ›ēØ‹å¹¶å‘čÆ»å†™ļ¼Ÿ** +A:`eval_case_parallelism=1` ę—¶äøä¼šć€‚č‹„å¼ŗč”Œč°ƒé«˜å¹¶å‘ļ¼Œå¤šäøŖ CLI å®žä¾‹åÆčƒ½åŒę—¶čÆ»å–č¢«å†™å…„ēš„ prompt ę–‡ä»¶ļ¼ŒåÆ¼č‡“čÆ„ęµ‹ē»“ęžœäøäø€č‡“ā€”ā€”čæ™ę˜Æč®¾ē½® `eval_case_parallelism=1` ēš„ę ¹ęœ¬åŽŸå› ć€‚ + +**Qļ¼šč·‘å®ŒåŽęƒ³č‡ŖåŠØęŠŠ best 写回 `workspace/`?** +A:在 `run_optimization.py` äø­å°† `update_source=False` 改为 `True`怂 + +## 8 Ā· ęŽ„å…„č‡Ŗęœ‰ CLI ēš„ę­„éŖ¤ + +1. **ę›æę¢ `CLI_BINARY`**:`agent/call_agent.py` äø­ę”¹äøŗäøšåŠ” CLI åÆę‰§č”Œč·Æå¾„ +2. 
**č°ƒę•“å‘½ä»¤č”Œå‚ę•°**:`_run_cli` äø­ēš„ argv ę•°ē»„ęŒ‰äøšåŠ” CLI åč®®ę”¹é€ ļ¼ˆargv ä¼  query / stdin ä¼  query / `--query xxx` å½¢å¼ē­‰ļ¼‰ +3. **ę›æę¢ env ę˜ å°„**:`_build_cli_env` ę”¹äøŗäøšåŠ” CLI ęœŸęœ›ēš„ēŽÆå¢ƒå˜é‡ļ¼ˆęˆ–å¦‚äøšåŠ” CLI å·²ęœ‰ OAuth ęµēØ‹ļ¼Œåˆ é™¤čÆ„ę˜ å°„å¹¶ęē¤ŗē”Øęˆ·å…ˆå®Œęˆē™»å½•ļ¼‰ +4. **修改 `TargetPrompt`**:`run_optimization.py` äø­č°ƒę•“ `add_path` č‡³äøšåŠ” CLI ęœŸęœ›ēš„ prompt 文件路径 +5. **ę›æę¢ prompt baseline**:业劔 baseline 内容写兄对应文件 +6. **ę›æę¢ę•°ę®é›†**:`train.evalset.json` / `val.evalset.json`ļ¼Œę³Øę„ reference å­—ę®µéœ€åŒ¹é… `_normalize_response` å¤„ē†åŽēš„å½¢ę€ +7. **čæč”Œå¹¶č§‚åÆŸ**ļ¼šę ¹ę® `summary.txt` å†³å®šę˜Æå¦č°ƒå‚ diff --git a/examples/optimization/blackbox_cli/agent/__init__.py b/examples/optimization/blackbox_cli/agent/__init__.py new file mode 100644 index 0000000..f0f8bd1 --- /dev/null +++ b/examples/optimization/blackbox_cli/agent/__init__.py @@ -0,0 +1,10 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Black-box CLI agent: subprocess 调 trpc-claudecode ēœŸå®ž CLI怂""" + +from .call_agent import call_agent + +__all__ = ["call_agent"] diff --git a/examples/optimization/blackbox_cli/agent/call_agent.py b/examples/optimization/blackbox_cli/agent/call_agent.py new file mode 100644 index 0000000..ae34e57 --- /dev/null +++ b/examples/optimization/blackbox_cli/agent/call_agent.py @@ -0,0 +1,141 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
# Directory handed to the CLI via --add-dir; it holds the prompt files being optimized.
WORKSPACE_DIR = Path(__file__).resolve().parent.parent / "workspace"
# Executable name (or path) of the black-box CLI; replace when adapting to another tool.
CLI_BINARY = "trpc-claudecode"
# Upper bound, in seconds, for a single CLI invocation.
CLI_TIMEOUT_SEC = 90.0


def _build_cli_env() -> dict[str, str]:
    """Return a copy of ``os.environ`` prepared for the CLI subprocess.

    The generic ``TRPC_AGENT_BASE_URL`` / ``TRPC_AGENT_API_KEY`` /
    ``TRPC_AGENT_MODEL_NAME`` variables are copied to the
    ``TRPC_CLAUDECODE_*`` names the CLI reads, so users do not have to
    configure OAuth or ``ANTHROPIC_API_KEY`` separately for the CLI.  The two
    auto-compact settings are added only when the caller has not already set
    them (``setdefault``).

    Returns:
        A new environment mapping; ``os.environ`` itself is not modified.

    Raises:
        RuntimeError: If any of the three ``TRPC_AGENT_*`` variables is unset
            or empty.
    """
    env = dict(os.environ)
    base_url = env.get("TRPC_AGENT_BASE_URL")
    api_key = env.get("TRPC_AGENT_API_KEY")
    model_name = env.get("TRPC_AGENT_MODEL_NAME")
    if not (base_url and api_key and model_name):
        raise RuntimeError(
            "TRPC_AGENT_BASE_URL / TRPC_AGENT_API_KEY / TRPC_AGENT_MODEL_NAME "
            "must be set so they can be forwarded to trpc-claudecode."
        )
    env["TRPC_CLAUDECODE_BASE_URL"] = base_url
    env["TRPC_CLAUDECODE_API_KEY"] = api_key
    env["TRPC_CLAUDECODE_MODEL"] = model_name
    env.setdefault("CLAUDE_CODE_AUTO_COMPACT_WINDOW", "165000")
    env.setdefault("CLAUDE_AUTOCOMPACT_PCT_OVERRIDE", "85")
    return env


# Greedy "first '{' to last '}'" pattern.  No longer used by
# _normalize_response (see there); kept so existing imports keep working.
_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL)

# Shared decoder; raw_decode() parses one JSON value starting at an offset and
# ignores whatever follows it.
_JSON_DECODER = json.JSONDecoder()


def _normalize_response(raw: str) -> str:
    """Normalize CLI stdout into one canonical compact JSON string.

    Steps:
      1. Strip the text and walk over every ``{`` from left to right, taking
         the first position at which a complete JSON object can be decoded.
         Text before or after that object (including stray braces) is
         ignored.  The previous greedy regex spanned from the first ``{`` to
         the last ``}`` and therefore failed whenever the surrounding text
         contained a brace.
      2. Re-serialize the object with sorted keys, no extra whitespace and
         non-ASCII characters kept as-is, so that an exact text match can be
         used as the metric.
      3. If no JSON object can be decoded, return the stripped text unchanged
         so the metric simply sees a non-matching string.

    Args:
        raw: Raw stdout of the CLI; ``None`` or empty is treated as "".

    Returns:
        The canonical JSON string, the stripped input when nothing parses, or
        "" for empty input.
    """
    text = (raw or "").strip()
    if not text:
        return ""
    start = text.find("{")
    while start != -1:
        try:
            parsed, _ = _JSON_DECODER.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a JSON object at this brace; try the next candidate.
            start = text.find("{", start + 1)
            continue
        return json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":"))
    return text


async def _run_cli(query: str) -> str:
    """Run the CLI once with ``query`` as its final argument and return stdout.

    The query is passed as an argv element (no shell involved).  The child is
    started with ``WORKSPACE_DIR`` as both ``--add-dir`` and working
    directory, and with the environment built by ``_build_cli_env``.

    Args:
        query: The user query forwarded to the CLI.

    Returns:
        The CLI's stdout decoded as UTF-8 (undecodable bytes replaced).

    Raises:
        RuntimeError: If the required environment variables are missing, the
            CLI does not finish within ``CLI_TIMEOUT_SEC`` seconds, or it
            exits with a non-zero return code (the message carries the first
            400 characters of stderr).
    """
    cmd = [
        CLI_BINARY,
        "--print",
        "--add-dir",
        str(WORKSPACE_DIR),
        "--dangerously-skip-permissions",
        query,
    ]
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        env=_build_cli_env(),
        cwd=str(WORKSPACE_DIR),
    )
    try:
        stdout_b, stderr_b = await asyncio.wait_for(
            proc.communicate(), timeout=CLI_TIMEOUT_SEC
        )
    except asyncio.TimeoutError as exc:
        raise RuntimeError(
            f"{CLI_BINARY} timed out after {CLI_TIMEOUT_SEC}s on query={query!r}"
        ) from exc
    finally:
        # Reap the child on every abnormal exit (timeout, cancellation of the
        # awaiting task, ...) so no orphaned CLI process is left behind.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                # The process ended between the check and the kill.
                pass
            await proc.wait()

    if proc.returncode != 0:
        raise RuntimeError(
            f"{CLI_BINARY} exited with code {proc.returncode}; "
            f"stderr={stderr_b.decode('utf-8', 'replace')[:400]}"
        )
    return stdout_b.decode("utf-8", "replace")


async def call_agent(query: str) -> str:
    """Framework callback: forward ``query`` to the black-box CLI and return its normalized output."""
    raw = await _run_cli(query)
    return _normalize_response(raw)
a/examples/optimization/blackbox_cli/run_optimization.py b/examples/optimization/blackbox_cli/run_optimization.py new file mode 100644 index 0000000..ed18c30 --- /dev/null +++ b/examples/optimization/blackbox_cli/run_optimization.py @@ -0,0 +1,89 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Blackbox CLI example 的优化器入口。 + +适用场景 +-------- +业务 agent 是外部命令行工具(trpc-claudecode / claude / codex / 自研 CLI), +其行为由若干 prompt 文件控制。本脚本演示通过 subprocess 把 CLI 当作完全 +黑盒的 agent,让 GEPA 优化它读取的 prompt 文件。 + +这个文件做什么 +-------------- +1. 注册 workspace/CLAUDE.md + workspace/.claude/skills/city-info/SKILL.md + 两个文件作为 TargetPrompt +2. call_agent 由 agent/call_agent.py 提供(subprocess 调用 CLI + stdout 规范化) +3. 
async def main() -> None:
    """Register both prompt files as one TargetPrompt and run AgentOptimizer.optimize."""
    # The CLI picks both files up through --add-dir when it starts, so once a
    # candidate has been written back to disk the next subprocess launch reads
    # the new prompt without any further wiring.
    prompt_target = TargetPrompt()
    prompt_target = prompt_target.add_path("claude_md", str(CLAUDE_MD_PATH))
    prompt_target = prompt_target.add_path("skill_md", str(SKILL_MD_PATH))

    # One output directory per run, named after the local start time.
    run_stamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")

    await AgentOptimizer.optimize(
        config_path=str(CONFIG_PATH),
        call_agent=call_agent,
        target_prompt=prompt_target,
        train_dataset_path=str(TRAIN_PATH),
        validation_dataset_path=str(VAL_PATH),
        output_dir=str(RUNS_DIR / run_stamp),
        update_source=False,
        verbose=1,
    )
[{"text": "成都"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"成都\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "city_shenzhen", + "conversation": [ + { + "invocation_id": "t5", + "user_content": {"parts": [{"text": "深圳"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"深圳\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "trainer", "state": {}} + } + ] +} diff --git a/examples/optimization/blackbox_cli/val.evalset.json b/examples/optimization/blackbox_cli/val.evalset.json new file mode 100644 index 0000000..25b5201 --- /dev/null +++ b/examples/optimization/blackbox_cli/val.evalset.json @@ -0,0 +1,40 @@ +{ + "eval_set_id": "blackbox_cli_val", + "name": "Black-box CLI demo - val", + "description": "3 äøŖäø­å›½åŸŽåø‚ēš„ē•™å‡ŗéŖŒčÆé›†ć€‚", + "eval_cases": [ + { + "eval_id": "city_hangzhou", + "conversation": [ + { + "invocation_id": "v1", + "user_content": {"parts": [{"text": "ę­å·ž"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"ę­å·ž\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "validator", "state": {}} + }, + { + "eval_id": "city_wuhan", + "conversation": [ + { + "invocation_id": "v2", + "user_content": {"parts": [{"text": "ꭦ걉"}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"city\":\"ꭦ걉\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "validator", "state": {}} + }, + { + "eval_id": "city_xian", + "conversation": [ + { + "invocation_id": "v3", + "user_content": {"parts": [{"text": "脿安"}], "role": "user"}, + "final_response": {"parts": [{"text": 
"{\"city\":\"脿安\",\"country\":\"中国\",\"is_capital\":false}"}], "role": "model"} + } + ], + "session_input": {"app_name": "blackbox_cli_demo", "user_id": "validator", "state": {}} + } + ] +} diff --git a/examples/optimization/blackbox_cli/workspace/.claude/skills/city-info/SKILL.md b/examples/optimization/blackbox_cli/workspace/.claude/skills/city-info/SKILL.md new file mode 100644 index 0000000..635a5f9 --- /dev/null +++ b/examples/optimization/blackbox_cli/workspace/.claude/skills/city-info/SKILL.md @@ -0,0 +1,6 @@ +--- +name: city-info +description: åŸŽåø‚äæ”ęÆ +--- + +å›žē­”å…³äŗŽåŸŽåø‚ēš„é—®é¢˜ć€‚ diff --git a/examples/optimization/blackbox_cli/workspace/CLAUDE.md b/examples/optimization/blackbox_cli/workspace/CLAUDE.md new file mode 100644 index 0000000..938b107 --- /dev/null +++ b/examples/optimization/blackbox_cli/workspace/CLAUDE.md @@ -0,0 +1 @@ +ä½ ę˜ÆåŠ©ę‰‹ć€‚å›žē­”ē”Øęˆ·é—®é¢˜ć€‚ diff --git a/examples/optimization/ci_integration/README.md b/examples/optimization/ci_integration/README.md new file mode 100644 index 0000000..e9ecfc7 --- /dev/null +++ b/examples/optimization/ci_integration/README.md @@ -0,0 +1,243 @@ +# CI Integration — čÆ„ęµ‹äøŽä¼˜åŒ–ę‹¼ęˆ CI/CD é—­ēŽÆ + +> **é€‚ē”Øåœŗę™Æ**ļ¼šäøšåŠ”åøŒęœ›åœØęŒē»­é›†ęˆęµę°“ēŗæäø­åŒę—¶čæč”Œ prompt č“Øé‡å®ˆé—Øļ¼ˆęÆę¬” PR č§¦å‘ļ¼‰äøŽ prompt č‡ŖåŠØä¼˜åŒ–ļ¼ˆå¤œé—“ēŖ—å£čæč”Œļ¼‰ļ¼Œå½¢ęˆ"PR å®ˆé—Ø → 夜闓优化 → 写回 prompt → 下一欔 PR č·‘ę–° prompt"ēš„ę¼”čæ›é—­ēŽÆć€‚ęœ¬ example 演示 `AgentEvaluator.evaluate`(pytestļ¼‰äøŽ `AgentOptimizer.optimize` å…±äŗ«åŒäø€ä»½ę•°ę®é›†ć€åŒäø€äøŖ `call_agent`ć€åŒäø€åÆ¹ prompt ę–‡ä»¶ēš„ē«Æåˆ°ē«Æé›†ęˆę–¹å¼ć€‚é˜…čÆ»å‰čÆ·å…ˆē†Ÿę‚‰ `quickstart/README.md` §2怂 + +## 1 Ā· é€‚ē”Øé—®é¢˜äøŽč®¾č®”ē›®ę ‡ + +prompt å·„ēØ‹åœØå·„ēØ‹åŒ–åœŗę™Æäø‹ēš„äø¤ē±»éœ€ę±‚ļ¼š + +- **PR å®ˆé—Ø**ļ¼šęÆę¬” PR č§¦å‘č‡ŖåŠØč·‘čÆ„ä¼°ļ¼Œåˆ†ę•°ä½ŽäŗŽé˜ˆå€¼å³ CI ēŗ¢ēÆļ¼Œé˜»ę­¢åŠ£åŒ– prompt 进主干 +- **夜闓优化**ļ¼šåœØä½Žå³°ęœŸč·‘åę€ä¼˜åŒ–ļ¼ŒęŠŠę›“ä¼˜ prompt å†™å›žęŗę–‡ä»¶ļ¼Œäø‹äø€ę¬” PR č‡ŖåŠØē”ØäøŠ + 
+å•ē‹¬ä½æē”Øä»»äø€é“¾č·Æéƒ½äøč¶³ļ¼šēŗÆå®ˆé—Øäøä¼šč®© prompt č‡ŖåŠØå˜å„½ļ¼ŒēŗÆä¼˜åŒ–ę²”ęœ‰č“Øé‡é—Øē¦ć€‚ęœ¬ example ęŠŠäø¤č€…é›†ęˆåˆ°åŒäø€ä»½čµ„äŗ§ä¹‹äøŠļ¼š + +- **åŒäø€ä»½ evalset**ļ¼šē‰©ē†äøŠę‹† train / val(SDK å¼ŗåˆ¶ēŗ¦ęŸļ¼Œé˜²ę³„ę¼ļ¼‰ļ¼Œé€»č¾‘äøŠę˜Æäø€å„—čæžē»­čÆ­ę–™ +- **åŒäø€äøŖ `call_agent`**:pytest äøŽ optimizer 都从 `agent/agent.py` åÆ¼å…„ē›øåŒå®žēŽ°ļ¼Œprompt ę”¹åŠØäø€å¤„ē”Ÿę•ˆ +- **åŒäø€åÆ¹ prompt ꖇ件**:optimizer 用 `update_source=True` å†™å›žęŗę–‡ä»¶ļ¼Œpytest äø‹ę¬”č‡ŖåŠØčÆ»å– + +| 输兄 | 输出 | +| --- | --- | +| 一份 evalsetļ¼ˆę‹†äøŗ train / val 两文件)+ 一个 call_agent + 一对 prompt ꖇ件 | PR 阶段:pytest ēŗ¢ / 绿 + JUnit XML | +| 两个 shell å…„å£ļ¼ˆPR ę£€ęŸ„ + å¤œé—“ä¼˜åŒ–ļ¼‰ | å¤œé—“é˜¶ę®µļ¼šęŗ prompt ę–‡ä»¶č¢«ęœ€ä¼˜å€™é€‰č¦†ē›– | + +### 本 example ę¼”ē¤ŗēš„ęœ€å°ē”Øä¾‹ + +| 结度 | 值 | +| --- | --- | +| 业劔任劔 | RESTful API ęčæ° → äø„ę ¼ JSON ē»“ęž„åŒ–ę‘˜č¦ | +| ä¼˜åŒ–ē›®ę ‡ | `agent/prompts/system.md` + `agent/prompts/skill.md` | +| éŖŒčÆęŒ‡ę ‡ | `final_response_avg_score`(exact åŒ¹é…č§„čŒƒåŒ– JSON,CI äøŠę— éœ€ LLM judge) | +| 训练 / éŖŒčÆč§„ęØ” | 见 `data/train.evalset.json` / `data/val.evalset.json` | + +## 2 Ā· ęœÆčÆ­åÆ¹ē…§ + +ä»…åˆ—å‡ŗęœ¬ example å¼•å…„ēš„ę–°ę¦‚åæµć€‚åŸŗē”€ęœÆčÆ­č§ `quickstart/README.md` §2怂 + +| 术语 | 含义 | +| --- | --- | +| **PR å®ˆé—Øļ¼ˆpre-merge gate)** | 在 PR č§¦å‘ēš„ CI 流程中跑 `AgentEvaluator.evaluate`ļ¼Œåˆ†ę•°ä½ŽäŗŽé˜ˆå€¼ę—¶ pytest ꊛ `AssertionError`态CI exit code 非 0ć€åˆå¹¶č¢«é˜»ę­¢ć€‚ | +| **å¤œé—“ä¼˜åŒ–ļ¼ˆnightly optimize)** | 在 CI ä½Žå³°ēŖ—å£č·‘ `AgentOptimizer.optimize`,`update_source=True` ę—¶ä¼˜åŒ–ē»“ęŸåŽęœ€ä¼˜å€™é€‰č‡ŖåŠØč¦†ē›–ęŗ prompt ꖇ件怂 | +| **`update_source=True`** | ä¼˜åŒ–ęˆåŠŸļ¼ˆ`OptimizeResult.status=SUCCEEDED`ļ¼‰åŽē”Øęœ€ä¼˜å€™é€‰č¦†ē›– `TargetPrompt` ę³Øå†Œēš„ęŗę–‡ä»¶ć€‚CI é—­ēŽÆēš„å…³é”®å¼€å…³ć€‚ | +| **JUnit XML** | pytest `--junitxml=` č¾“å‡ŗēš„ę ‡å‡†åŒ–ęµ‹čÆ•ęŠ„å‘Šę ¼å¼ć€‚GitHub Actions / č“ē›¾ęµę°“ēŗæ / Tencent CI ē­‰äø»ęµå¹³å°å‡åŽŸē”Ÿč§£ęžć€‚ | + +## 3 Ā· čæč”Œē¤ŗä¾‹ + +### 3.1 å®‰č£…ä¾čµ– + 
+```bash +pip install -e ".[optimize]" +pip install pytest pytest-asyncio +``` + +### 3.2 é…ē½®ēŽÆå¢ƒå˜é‡ + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 PR 阶段:pytest å®ˆé—Ø + +```bash +cd examples/optimization/ci_integration +PYTHONPATH=../../.. bash ci/run_pr_check.sh +``` + +蔌为: + +- pytest 加载 `tests/test_agent_quality.py` → 调 `AgentEvaluator.evaluate(call_agent, val.evalset.json, ...)` +- å¤±č“„ę—¶ę”†ęž¶ęŠ› `AssertionError` → pytest exit code != 0 → CI 红灯 +- JUnit XML 落到 `runs/pytest_report.xml`,CI å¹³å°åŽŸē”Ÿå±•ē¤ŗ + +### 3.4 å¤œé—“ēŖ—å£ļ¼šč·‘ä¼˜åŒ–å¹¶å†™å›ž + +```bash +cd examples/optimization/ci_integration +PYTHONPATH=../../.. bash ci/run_nightly_optimize.sh +``` + +蔌为: + +- `AgentOptimizer.optimize(update_source=True)` č·‘ GEPA åę€ +- ä¼˜åŒ–ęˆåŠŸåŽęœ€ä¼˜å€™é€‰č¦†ē›– `agent/prompts/system.md` + `agent/prompts/skill.md` +- ēœŸå®žęµę°“ēŗæåœØęœ«å°¾åÆåŠ  `git diff agent/prompts/` + č‡ŖåŠØå¼€ PR + +下一欔 PR č§¦å‘ēš„ `run_pr_check.sh` č‡ŖåŠØē”ØäøŠę–° prompt → é—­ēŽÆč¾¾ęˆć€‚ + +### 3.5 äŗ§ē‰©ē»“ęž„ + +``` +runs/ +ā”œā”€ā”€ pytest_eval/ # AgentEvaluator č¾“å‡ŗļ¼ˆpytest é˜¶ę®µļ¼‰ +ā”œā”€ā”€ pytest_report.xml # JUnit XML(CI å¹³å°åŽŸē”Ÿę¶ˆč“¹ļ¼‰ +└── optimize_/ # AgentOptimizer č¾“å‡ŗļ¼ˆå¤œé—“é˜¶ę®µļ¼‰ + ā”œā”€ā”€ result.json + ā”œā”€ā”€ summary.txt + ā”œā”€ā”€ baseline_prompts/ + ā”œā”€ā”€ best_prompts/ + └── rounds/ +``` + +## 4 Ā· ęž¶ęž„äøŽę•°ę®ęµ + +``` + ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” + │ agent/ │ + │ ā”œā”€ā”€ agent.py (call_agent) │ + │ └── prompts/ │ + │ ā”œā”€ā”€ system.md │ + │ └── skill.md │ + ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ 共享 + ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”“ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” + │ │ + 
ā”Œā”€ā”€ā”€ā”€ā–¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā–¼ā”€ā”€ā”€ā”€ā”€ā” + │ AgentEvaluator │ │ AgentOptimizer │ + │ .evaluate() │ │ .optimize() │ + │ │ │ │ + │ č§¦å‘: PR │ │ č§¦å‘: å¤œé—“ēŖ—å£ │ + │ ę•°ę®: val.json │ │ ę•°ę®: train + val │ + │ 产出: ēŗ¢/绿 │ │ 产出: 写回 prompt │ + │ 退出码: å®ˆé—Ø │ │ update_source=True │ + ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ + └─→ å…±äŗ«åŒäø€ä»½ data/ + åŒäø€ä»½ metric 定义 +``` + +### 4.1 ę–‡ä»¶ęø…å• + +| ꖇ件 | 角色 | ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ēš„äæ®ę”¹ę–¹å‘ | +| --- | --- | --- | +| `run_optimization.py` | å¤œé—“ä¼˜åŒ–å…„å£ļ¼Œ`update_source=True` | äøŽ quickstart åŒļ¼›äæęŒ `update_source=True` | +| `agent/agent.py` | 共享 `call_agent`(被 pytest + optimizer åŒę—¶č°ƒē”Øļ¼‰ | ę›æę¢äøŗäøšåŠ” agentļ¼›äæęŒå‡½ę•°å‘½å `call_agent` | +| `agent/prompts/{system,skill}.md` | ä¼˜åŒ–ē›®ę ‡ + å®ˆé—ØčÆ»å–ēš„ prompt | å†™å…„äøšåŠ” baseline | +| `tests/test_agent_quality.py` | pytest å®ˆé—Øå…„å£ | č°ƒę•“ `agent_name` / ę•°ę®č·Æå¾„ļ¼›ęµ‹čÆ•ę–¹ę³•äæęŒäøå˜ | +| `optimizer.json` | 算法 + metric é…ē½® | äøŽå• example 写法一臓 | +| `data/train.evalset.json` / `data/val.evalset.json` | 训练 / éŖŒčÆé›† | ę›æę¢äøŗäøšåŠ”ē”Øä¾‹ | +| `ci/run_pr_check.sh` | PR 阶ꮵ shell 兄口 | č°ƒę•“ `pytest` å‚ę•° / `--junitxml` 路径 | +| `ci/run_nightly_optimize.sh` | å¤œé—“é˜¶ę®µ shell 兄口 | č°ƒę•“č°ƒē”Øé¢‘ēŽ‡ / å¤±č“„å›žę»šē­–ē•„ | + +### 4.2 train / val ę‹†åˆ†ēš„å¼ŗåˆ¶ēŗ¦ęŸ + +SDK `_validate_inputs` 会栔验 `train_dataset_path != validation_dataset_path` é˜²ę­¢ę•°ę®ę³„ę¼ć€‚ē‰©ē†äøŠåæ…é”»ę˜Æäø¤äøŖę–‡ä»¶ļ¼Œä½†äŗŒč€…ļ¼š + +- schema å®Œå…Øäø€č‡“ļ¼ˆåŒäø€å„— `EvalSet` ęØ”åž‹ļ¼‰ +- `eval_set_id` å‘½åę—åÆå…±äŗ«ļ¼ˆå¦‚ `api_summarizer.train` / `api_summarizer.val`) +- metric å®šä¹‰ē»Ÿäø€åœØ `optimizer.json` 中(pytest čµ° `test_config.json` åŒē›®å½•ēŗ¦å®šę—¶ä¹Ÿę˜Æäø€č‡“ schema) + +é€»č¾‘äøŠä»ę˜ÆåŒäø€å„—ę•°ę®čÆ­čØ€ć€‚ + +## 5 Ā· 
å…³é”®é…ē½® + +### 5.1 `update_source=True` ēš„čÆ­ä¹‰ + +```python +await AgentOptimizer.optimize( + ..., + update_source=True, # ä¼˜åŒ–ęˆåŠŸåŽč¦†ē›–ęŗ prompt ꖇ件 +) +``` + +| ēŠ¶ę€ | 蔌为 | +| --- | --- | +| `OptimizeResult.status=SUCCEEDED` | ęœ€ä¼˜å€™é€‰å†™å…„ `TargetPrompt` ę³Øå†Œēš„ęŗę–‡ä»¶ | +| `status=FAILED` / `BUDGET_EXHAUSTED` 等其他 | ęŗę–‡ä»¶äæęŒäøå˜ļ¼›å€™é€‰åŖåœØ `runs//best_prompts/` | + +CI é—­ēŽÆä¾čµ–čÆ„å¼€å…³ļ¼šåŖęœ‰"ä¼˜åŒ–ēœŸēš„ę‰¾åˆ°äŗ†ę›“å„½ēš„ prompt"ę‰ä¼šå†™å›žļ¼›å¦åˆ™äæęŒēŽ°ēŠ¶äøę±”ęŸ“äø»å¹²ć€‚ + +### 5.2 CI äøŠäøä¾čµ– LLM judge + +`agent/agent.py` äø­ `_normalize_json` 把 LLM č¾“å‡ŗč§„čŒƒåŒ–ęˆēØ³å®š JSON å­—ē¬¦äø²ļ¼š + +```python +json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":")) +``` + +→ `final_response_avg_score(text.match=exact)` åÆē›“ęŽ„é€å­—ē¬¦ęÆ”åÆ¹ļ¼Œ**CI äøŠå®Œå…Øäøéœ€č¦ LLM judge**: + +- **åæ«**ļ¼šå• case 一欔评测仅一欔 agent LLM č°ƒē”Ø +- **稳**ļ¼šåŒäø€ prompt åŒäø€ case č¾“å‡ŗē”®å®šļ¼ˆtemperature=0.1) +- **åÆé‡å¤**:CI å¤šę¬”č·‘ē»“ęžœäø€č‡“ + +LLM judge åœØäø»č§‚ē»“åŗ¦čÆ„ä¼°äøŠäøåÆę›æä»£ļ¼Œä½†åœØē»“ęž„åŒ–č¾“å‡ŗåœŗę™Æäø‹åŗ”ä¼˜å…ˆé€‰ę‹© text exact + č§„čŒƒåŒ–ę–¹ę”ˆć€‚ + +### 5.3 失蓄 case ēš„åÆč§‚ęµ‹ę€§ + +pytest é˜¶ę®µå¤±č“„ę—¶ę”†ęž¶ęŠ› `AssertionError`ļ¼Œé”™čÆÆę¶ˆęÆåŒ…å«ęÆę” case ēš„å¤±č“„ę˜Žē»† JSON怂CI å¹³å°å±•ē¤ŗ stack trace ę—¶åÆē›“ęŽ„ēœ‹åˆ°å…·ä½“å“Ŗę” case 失蓄、agent å®žé™…č¾“å‡ŗę˜Æä»€ä¹ˆć€äøŽ expected ēš„å·®å¼‚åœØå“Ŗć€‚ę— éœ€é¢å¤–ę—„åæ—č§£ęžé€»č¾‘ć€‚ + +## 6 Ā· CI/CD é—­ēŽÆč®¾č®”č¦ē‚¹ + +### 6.1 äøŗä»€ä¹ˆ evaluate äøŽ optimize 共享 call_agent + +prompt å·„ēØ‹ēš„ę øåæƒēŗ¦ęŸļ¼š**čÆ„ęµ‹ę—¶ä½æē”Øēš„ agent å’Œä¼˜åŒ–ę—¶ä½æē”Øēš„ agent 必锻等价**ļ¼Œå¦åˆ™ä¼˜åŒ–ę–¹å‘äøŽå®ˆé—Øę–¹å‘äøäø€č‡“ļ¼Œä¼šå‡ŗēŽ°"ä¼˜åŒ–å™Øę‰¾åˆ°äŗ† evaluator éŖŒčÆäøäŗ†ēš„å„½ prompt"ęˆ–åå‘é—®é¢˜ć€‚ + +é€ščæ‡å…±äŗ« `agent/agent.py` äø­ēš„ `call_agent` å®žēŽ°ļ¼Œä»Žä»£ē å±‚é¢äæčÆē­‰ä»·ę€§ć€‚ä»»ä½• agent č”Œäøŗę”¹åŠØļ¼ˆęØ”åž‹åˆ‡ę¢ć€temperature č°ƒę•“ć€output schema å˜åŒ–ļ¼‰åŖéœ€ę”¹äø€å¤„ļ¼Œpytest äøŽ 
optimizer åŒę—¶ē”Ÿę•ˆć€‚ + +### 6.2 äøŗä»€ä¹ˆå¤œé—“ēŖ—å£č·‘č€Œäøå®žę—¶ä¼˜åŒ– + +- LLM č°ƒē”Øé¢„ē®—ęœ‰é™ļ¼Œåę€ä¼˜åŒ–č€—ę—¶ę•°åˆ†é’Ÿåˆ°ę•°ååˆ†é’Ÿļ¼Œäøé€‚åˆ PR č§¦å‘ +- ä¼˜åŒ–ē»“ęžœå…·ęœ‰ę–¹å·®ę€§ļ¼Œéœ€åœØä½Žå³°ęœŸåå¤å¤šč½®éŖŒčÆåŽå†å‘ē‰ˆ +- 写回 prompt ę–‡ä»¶åŗ”ē»čæ‡ git diff / äŗŗå·„ review / č‡ŖåŠØå¼€ PR ē­‰ęµēØ‹č€Œéžē›“ęŽ„čæ›äø»å¹² + +### 6.3 ä½•ę—¶č€ƒč™‘ē°åŗ¦å‘åøƒ + +`update_source=True` ē›“ęŽ„č¦†ē›–ęŗę–‡ä»¶é€‚åˆļ¼š + +- å›¢é˜Ÿč§„ęØ”å°ļ¼ŒPR review ęµēØ‹å·²čƒ½ę‹¦ęˆŖäøåˆē†ę”¹åŠØ +- prompt ę”¹åŠØå½±å“é¢åÆęŽ§ļ¼ˆå• agent / å•äøšåŠ”ļ¼‰ + +äøé€‚åˆļ¼š + +- å¤šäøšåŠ”ēŗæå…±äŗ«åŒäø€ä»½ prompt 仓库 +- ę”¹åŠØåŽéœ€ē°åŗ¦č§‚åÆŸēŗæäøŠ metric 变化 + +åŽč€…å»ŗč®®ę”¹äøŗ `update_source=False` + 把 `runs//best_prompts/` ęŽ„å…„äøšåŠ”č‡Ŗęœ‰ēš„ē°åŗ¦å‘åøƒå·„å…·ć€‚ + +## 7 Ā· åøøč§é—®é¢˜ + +**Qļ¼ščƒ½å¦åœØåŒäø€äøŖ CI job äø­å…ˆč·‘čÆ„ä¼°å†č·‘ä¼˜åŒ–ļ¼Ÿ** +Aļ¼šęŠ€ęœÆäøŠåÆč”Œļ¼Œä½†äøęŽØčć€‚čÆ„ä¼°åŗ”åæ«é€Ÿē»™åé¦ˆļ¼ˆ< 1 minļ¼‰ļ¼Œä¼˜åŒ–č€—ę—¶é•æļ¼ˆ> 5 minļ¼‰ć€‚äø¤č€…ę‹†ęˆē‹¬ē«‹ job / 独立 trigger ę›“ē¬¦åˆå·„ēØ‹å®žč·µć€‚ + +**Q:CI å¦‚ä½•čÆ†åˆ«"ä¼˜åŒ–ę²”ę”¹å–„"čÆ„å¦‚ä½•å›žé€€ļ¼Ÿ** +A:`run_nightly_optimize.sh` 末尾建议加 `git diff --quiet agent/prompts/` åˆ¤ę–­ę˜Æå¦ęœ‰ę”¹åŠØļ¼›ę— ę”¹åŠØē›“ęŽ„é€€å‡ŗć€‚å¦‚ęžœę”¹åŠØč“Øé‡åŽē»­č¢«å‘ēŽ°é€€åŒ–ļ¼Œē”± PR review ę‹’ē»åˆå¹¶å³åÆā€”ā€”`update_source=True` ēš„å†™å›žäøē›“ęŽ„čæ›äø»å¹²ļ¼Œä»čµ°ę ‡å‡† PR 流程。 + +**Q:pytest äøŽ optimizer ē”Øēš„ metric é…ē½®äøåŒä¼šę€Žę ·ļ¼Ÿ** +Aļ¼šä¼šå‡ŗēŽ°"čÆ„ęµ‹čƒ½čæ‡ä½†ä¼˜åŒ–å™Øēœ‹åˆ°ēš„åˆ†ę•°ä½Ž"ęˆ–åå‘é—®é¢˜ć€‚ęœ¬ example é€ščæ‡č®© pytest čµ° `AgentEvaluator.evaluate(test_config_path=...)`态optimizer čµ° `optimizer.json.evaluate.metrics`ć€äŗŒč€…ä½æē”Øē›øåŒ schema ę„éæå…ę¼‚ē§»ć€‚ē”Ÿäŗ§äø­å»ŗč®®ęŠŠ metric é…ē½®ęŠ½ęˆäø€ä»½å…±äŗ« JSONļ¼Œäø¤č¾¹å¼•ē”Øć€‚ + +**Q:reflection LM å¤±č“„é‡čÆ•é¢„ē®—ļ¼Ÿ** +A:`optimizer.json` äø­ `algorithm.max_iterations_without_improvement` ęŽ§åˆ¶ę— ę”¹å–„ę—©åœļ¼›reflection LM å•ę¬”č°ƒē”Øå¤±č“„ē”± SDK å†…éƒØé‡čÆ• 1–2 ꬔ怂CI åœŗę™Æå»ŗč®®ęŠŠ 
`max_metric_calls` č°ƒåˆ°åˆē†äøŠé™éæå…å•ę¬”č·‘ēˆ†é¢„ē®—ć€‚ + +**Q:JUnit XML äø­čƒ½ēœ‹åˆ°å…·ä½“å¤±č“„ case å—ļ¼Ÿ** +Aļ¼ščƒ½ć€‚pytest 把 `AssertionError` ę¶ˆęÆåŽŸę ·å†™å…„ XML,CI å¹³å°å±•ē¤ŗę—¶åÆē›“ęŽ„ēœ‹åˆ°å¤±č“„ę˜Žē»† JSON怂 + +## 8 Ā· ęŽ„å…„č‡Ŗęœ‰ CI ēš„ę­„éŖ¤ + +1. **敓理 evalset**ļ¼šę‹† `train.evalset.json` / `val.evalset.json` 两文件 +2. **定义 metric**:在 `optimizer.json` äøŽ pytest ęµ‹čÆ•äø­ä½æē”ØåŒäø€ schema ēš„ metric é…ē½® +3. **å®žēŽ°å…±äŗ« call_agent**:`agent/agent.py` 写一份 `call_agent`,pytest äøŽ optimizer éƒ½ä»Žę­¤å¤„åÆ¼å…„ +4. **设置 `update_source=True`**ļ¼šå¤œé—“ä¼˜åŒ–å…„å£ēš„å…³é”®å¼€å…³ +5. **é…ē½® CI 流氓线**: + - PR č§¦å‘ `bash ci/run_pr_check.sh`ļ¼Œč§£ęž `runs/pytest_report.xml` + - å¤œé—“č§¦å‘ `bash ci/run_nightly_optimize.sh`,末尾加 git diff + č‡ŖåŠØå¼€ PR +6. **č§‚åÆŸé¦–č½®é—­ēŽÆ**ļ¼šä»Ž baseline pytest ēŗ¢ → 夜闓优化 → PR č‡ŖåŠØå¼€ → review → 合并 → 下一欔 PR 绿 diff --git a/examples/optimization/ci_integration/agent/__init__.py b/examples/optimization/ci_integration/agent/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/optimization/ci_integration/agent/agent.py b/examples/optimization/ci_integration/agent/agent.py new file mode 100644 index 0000000..048ced1 --- /dev/null +++ b/examples/optimization/ci_integration/agent/agent.py @@ -0,0 +1,156 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""API ę‘˜č¦ Agent —— evaluate äøŽ optimize åŒé“¾č·Æå…±äŗ«ēš„ē»Ÿäø€å…„å£ć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +CI/CD é—­ēŽÆēš„ę øåæƒēŗ¦ęŸļ¼ščÆ„ęµ‹ę—¶ēš„ agent äøŽä¼˜åŒ–ę—¶ēš„ agent åæ…é”»ē­‰ä»·ć€‚ęœ¬ę–‡ä»¶ +作为 PR å®ˆé—Øļ¼ˆpytestļ¼‰äøŽå¤œé—“ä¼˜åŒ–ļ¼ˆAgentOptimizer.optimizeļ¼‰å…±äŗ«ēš„ +call_agent å®žēŽ°ļ¼ŒäæčÆäø¤ę”é“¾č·Æēœ‹åˆ°ē›øåŒ agent č”Œäøŗć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +1. 暓露 SYSTEM_PROMPT_PATH / SKILL_PATH 作为 TargetPrompt ę³Øå†Œē›®ę ‡ +2. 
ęä¾› call_agent é»‘ē›’å…„å£ļ¼ˆč¢« pytest + optimizer åŒę—¶č°ƒē”Øļ¼‰ +3. 用 _normalize_json 把 LLM č¾“å‡ŗč§„čŒƒåŒ–äøŗēØ³å®š JSON å­—ē¬¦äø²ļ¼Œä½æ metric + čµ° text exact č€Œéžä¾čµ– LLM judge——CI äøŠåæ«ć€ēØ³ć€åÆé‡å¤ + +äøŗä»€ä¹ˆ evaluate äøŽ optimize 要共享 call_agent +--------------------------------------------- +é€ščæ‡å…±äŗ«åŒäø€ä»½ä»£ē ļ¼ŒäæčÆä»»ä½• agent č”Œäøŗę”¹åŠØļ¼ˆęØ”åž‹åˆ‡ę¢ć€temperature č°ƒę•“ć€ +output schema å˜åŒ–ļ¼‰åŖéœ€ę”¹äø€å¤„ļ¼ŒPR å®ˆé—ØäøŽå¤œé—“ä¼˜åŒ–åŒę—¶ē”Ÿę•ˆć€‚å¦åˆ™ä¼šå‡ŗēŽ° +"ä¼˜åŒ–å™Øę‰¾åˆ°äŗ† evaluator éŖŒčÆäøäŗ†ēš„ prompt"čæ™ē§é“¾č·Æå¤±é…é—®é¢˜ć€‚ +""" + +from __future__ import annotations + +import json +import re +import uuid +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import Part + +from .config import get_model_config + + +SYSTEM_PROMPT_PATH = Path(__file__).parent / "prompts" / "system.md" +SKILL_PATH = Path(__file__).parent / "prompts" / "skill.md" + +APP_NAME = "ci_integration_demo" + +_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL) + + +def _create_model() -> LLMModel: + """ęž„å»ŗ OpenAI 兼容 chat ęØ”åž‹å®žä¾‹ć€‚""" + api_key, base_url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url) + + +def _read_instruction() -> str: + """ä»Žäø¤äøŖ prompt ę–‡ä»¶ę‹¼åˆå®Œę•“ instruction怂 + + ęÆę¬”č°ƒē”Øéƒ½é‡čÆ»ē£ē›˜ā€”ā€”å¤œé—“ä¼˜åŒ–é˜¶ę®µ GEPA ęŠŠå€™é€‰å†™åˆ°ē£ē›˜åŽäø‹äø€ę¬”ęŽØē† + ē«‹å³ē”Ÿę•ˆļ¼›PR é˜¶ę®µę‹æåˆ°ēš„ä¹Ÿę˜Æęœ€ę–°å·²č½ē›˜ēš„ē‰ˆęœ¬ć€‚ + """ + system = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + skill = SKILL_PATH.read_text(encoding="utf-8").strip() + return f"{system}\n\n## How to write the 
summary\n{skill}" + + +def create_agent() -> LlmAgent: + """ęž„å»ŗäø€äøŖä½æē”Øå½“å‰ē£ē›˜ prompt ēš„ę–° LlmAgent å®žä¾‹ć€‚""" + return LlmAgent( + name="api_summarizer", + description="Summarizes a RESTful API description into a strict JSON.", + model=_create_model(), + instruction=_read_instruction(), + generate_content_config=GenerateContentConfig( + temperature=0.1, + top_p=0.9, + max_output_tokens=512, + ), + ) + + +# 兼容 agent_module="agent" 加载约定(root_agent)。 +# AgentEvaluator 在 call_agent ęØ”å¼äø‹å¹¶äøéœ€č¦å®ƒļ¼Œä½†äæē•™ę— å®³ļ¼Œę–¹ä¾æåˆ‡ę¢å½¢ę€ć€‚ +root_agent = create_agent() + + +def _normalize_json(raw: str) -> str: + """把 LLM č¾“å‡ŗč§„čŒƒåŒ–ęˆēØ³å®š JSON 字符串。 + + 歄骤: + 1. ē”Øę­£åˆ™å®šä½é¦–äøŖ {...} å—ļ¼ˆå…¼å®¹ęØ”åž‹å¶å°”åœØ JSON å‰åŽå¤šåå­—ē¬¦ļ¼‰ + 2. json.dumps(sort_keys=True, ensure_ascii=False, separators=(",", ":")) + ę¶ˆé™¤ē©ŗę ¼ / key é”ŗåŗå·®å¼‚ + 3. č§£ęžå¤±č“„ę—¶åŽŸę ·čæ”å›žļ¼ˆč®© metric ēœ‹åˆ° "garbage" → 0 åˆ†ļ¼‰ + + ē»čæ‡ęœ¬å‡½ę•°åŽ baseline / 候选 prompt / evalset ęœŸęœ›å€¼éƒ½åÆ¹é½åˆ°å”Æäø€ + å­—ē¬¦äø²å½¢ę€ļ¼ŒåÆē›“ęŽ„čµ° final_response_avg_score(text.match=exact)怂 + CI 上**å®Œå…Øäøä¾čµ– LLM judge**ļ¼Œé€Ÿåŗ¦äøŽēØ³å®šę€§ę˜¾č‘—ęå‡ć€‚ + """ + text = (raw or "").strip() + if not text: + return "" + match = _JSON_OBJECT_RE.search(text) + if not match: + return text + try: + parsed = json.loads(match.group(0)) + except json.JSONDecodeError: + return text + return json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":")) + + +async def call_agent(query: str) -> str: + """供 evaluate / optimize å…±äŗ«ēš„é»‘ē›’ agent å…„å£ć€‚ + + ęÆę¬”č°ƒē”Øéƒ½é‡ę–°ęž„å»ŗ Runner + InMemorySessionServiceļ¼Œē»™ęÆäøŖ case ē‹¬ē«‹ēš„ + session stateļ¼Œå¹¶å‘čÆ„ęµ‹ę—¶äøäŗ’ē›øę±”ęŸ“ć€‚ + """ + root = create_agent() + session_service = InMemorySessionService() + runner = Runner( + app_name=APP_NAME, + agent=root, + session_service=session_service, + ) + session_id = str(uuid.uuid4()) + user_id = "ci" + await 
session_service.create_session( + app_name=APP_NAME, + user_id=user_id, + session_id=session_id, + state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=query)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, + session_id=session_id, + new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return _normalize_json(final_text) diff --git a/examples/optimization/ci_integration/agent/config.py b/examples/optimization/ci_integration/agent/config.py new file mode 100644 index 0000000..d0a64b1 --- /dev/null +++ b/examples/optimization/ci_integration/agent/config.py @@ -0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""ęØ”åž‹å‡­ę®čÆ»å– —— ä»ŽēŽÆå¢ƒå˜é‡åŠ č½½ OpenAI 兼容 LLM ēš„čæžęŽ„äæ”ęÆć€‚ + +éœ€č¦ēš„ēŽÆå¢ƒå˜é‡ +-------------- + TRPC_AGENT_API_KEY LLM åŽē«Æēš„ API key + TRPC_AGENT_BASE_URL LLM åŽē«Æēš„ endpoint + TRPC_AGENT_MODEL_NAME ęØ”åž‹å + +ē¼ŗä»»ę„äø€äøŖå°±ē«‹å³ęŠ› ValueErrorļ¼Œéæå…čæč”Œåˆ°äø€åŠę‰ę’žåˆ° LLM åŽē«Æēš„ 401 é”™čÆÆļ¼Œ +é‚£ę—¶ęŠ„é”™äæ”ęÆä¼šå¾ˆęœ‰čæ·ęƒ‘ę€§ļ¼ˆēœ‹čµ·ę„åƒ prompt å†™é”™äŗ†ļ¼Œå®žé™…ę˜Æå‡­ę®ę²”é…ļ¼‰ć€‚ +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """čæ”å›ž (api_key, base_url, model_name)ļ¼›ä»»äø€ē¼ŗå¤±ē«‹åˆ»ęŠ„é”™ć€‚""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "čæč”Œä¼˜åŒ–å™Øå‰åæ…é”»é…ē½®ēŽÆå¢ƒå˜é‡ TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME怂" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/ci_integration/agent/prompts/skill.md b/examples/optimization/ci_integration/agent/prompts/skill.md new file mode 100644 index 0000000..5d77a12 --- /dev/null +++ b/examples/optimization/ci_integration/agent/prompts/skill.md @@ -0,0 +1 @@ +Describe the API briefly. diff --git a/examples/optimization/ci_integration/agent/prompts/system.md b/examples/optimization/ci_integration/agent/prompts/system.md new file mode 100644 index 0000000..ffc8e8f --- /dev/null +++ b/examples/optimization/ci_integration/agent/prompts/system.md @@ -0,0 +1,10 @@ +You are an API summarizer. Output a single JSON object describing the API. + +The JSON must have exactly these four keys and no others: "auth", "method", "path", "summary". + +- "auth": use the string "none" if no authentication is needed, or "required" if authentication is needed. Do not use any other value. +- "method": the HTTP method as a string (e.g., "GET", "POST", "PUT", "DELETE"). 
+- "path": the endpoint path string (e.g., "/users/{id}"). +- "summary": a short imperative verb phrase (e.g., "Get user profile", "Cancel order", "Login with credentials"). Do not include articles or extra words. + +Output compact JSON with no whitespace after colons or commas, and no trailing newline. Output nothing except the JSON object. \ No newline at end of file diff --git a/examples/optimization/ci_integration/ci/run_nightly_optimize.sh b/examples/optimization/ci_integration/ci/run_nightly_optimize.sh new file mode 100755 index 0000000..d9aeac4 --- /dev/null +++ b/examples/optimization/ci_integration/ci/run_nightly_optimize.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# å¤œé—“ēŖ—å£ļ¼šč·‘ GEPA ä¼˜åŒ–ļ¼ŒęŠŠęœ€ä¼˜ prompt ē›“ęŽ„å†™å›žęŗę–‡ä»¶ć€‚ +# ēœŸå®ž CI é‡Œé€šåøøä¼šåœØęœ«å°¾čæ½åŠ  `git diff` ēœ‹ę˜Æå¦ęœ‰ę”¹åŠØļ¼Œå†å¼€ PR怂 +set -euo pipefail +cd "$(dirname "$0")/.." + +PY="${PYTHON:-python3}" +"$PY" run_optimization.py + +echo "" +echo "=== Optimization done ===" +echo "Best prompts have been written back to:" +echo " agent/prompts/system.md" +echo " agent/prompts/skill.md" +echo "" +echo "Next steps for a real CI pipeline:" +echo " git diff agent/prompts/ # see what GEPA changed" +echo " git checkout -b auto/optimize-\$(date +%Y%m%d)" +echo " git add agent/prompts/ && git commit -m 'auto: optimize prompts'" +echo " # then open a PR; PR check (run_pr_check.sh) re-validates the new prompts." diff --git a/examples/optimization/ci_integration/ci/run_pr_check.sh b/examples/optimization/ci_integration/ci/run_pr_check.sh new file mode 100755 index 0000000..5087449 --- /dev/null +++ b/examples/optimization/ci_integration/ci/run_pr_check.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# PR é˜¶ę®µļ¼šč·‘ pytest å®ˆęŠ¤č“Øé‡åŗ•ēŗæć€‚ +# - 失蓄时 pytest exit code != 0 → CI 红灯 +# - --junitxml 产出 GitHub Actions / Tencent CI / č“ē›¾ęµę°“ēŗæå‡åŽŸē”Ÿę”ÆęŒēš„ęŠ„å‘Š +set -euo pipefail +cd "$(dirname "$0")/.." 
+ +mkdir -p runs + +PY="${PYTHON:-python3}" +exec "$PY" -m pytest tests/ -v --tb=short \ + --junitxml=runs/pytest_report.xml diff --git a/examples/optimization/ci_integration/data/test_config.json b/examples/optimization/ci_integration/data/test_config.json new file mode 100644 index 0000000..0ac840f --- /dev/null +++ b/examples/optimization/ci_integration/data/test_config.json @@ -0,0 +1,14 @@ +{ + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "exact", "case_insensitive": false} + } + } + } + ], + "num_runs": 1 +} diff --git a/examples/optimization/ci_integration/data/train.evalset.json b/examples/optimization/ci_integration/data/train.evalset.json new file mode 100644 index 0000000..0257d83 --- /dev/null +++ b/examples/optimization/ci_integration/data/train.evalset.json @@ -0,0 +1,40 @@ +{ + "eval_set_id": "ci_integration_train", + "name": "CI Integration - train", + "description": "Optimize é˜¶ę®µåę€ minibatch ę„ęŗć€‚schema äøŽ val.evalset.json å®Œå…Øäø€č‡“ļ¼›äøŽ evaluate é˜¶ę®µå…±ē”ØåŒäø€äøŖ test_config.jsonļ¼ˆåŒē›®å½•ēŗ¦å®šļ¼‰ć€‚", + "eval_cases": [ + { + "eval_id": "train_get_user", + "conversation": [ + { + "invocation_id": "t1", + "user_content": {"parts": [{"text": "GET /users/{id} returns user profile, requires Bearer token."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"required\",\"method\":\"GET\",\"path\":\"/users/{id}\",\"summary\":\"Get user profile\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + }, + { + "eval_id": "train_post_login", + "conversation": [ + { + "invocation_id": "t2", + "user_content": {"parts": [{"text": "POST /auth/login accepts username and password, no auth header needed."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"none\",\"method\":\"POST\",\"path\":\"/auth/login\",\"summary\":\"Login with 
credentials\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + }, + { + "eval_id": "train_delete_order", + "conversation": [ + { + "invocation_id": "t3", + "user_content": {"parts": [{"text": "DELETE /orders/{id} cancels an order, Bearer token required."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"required\",\"method\":\"DELETE\",\"path\":\"/orders/{id}\",\"summary\":\"Cancel order\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + } + ] +} diff --git a/examples/optimization/ci_integration/data/val.evalset.json b/examples/optimization/ci_integration/data/val.evalset.json new file mode 100644 index 0000000..8a65b19 --- /dev/null +++ b/examples/optimization/ci_integration/data/val.evalset.json @@ -0,0 +1,40 @@ +{ + "eval_set_id": "ci_integration_val", + "name": "CI Integration - val", + "description": "PR 阶ꮵ pytest å®ˆé—Øē”Øä¾‹ + Optimize é˜¶ę®µęÆč½®éŖŒčÆē”Øä¾‹ļ¼ˆåŒē”Øļ¼‰ć€‚ē‰©ē†äøŠę˜Æē‹¬ē«‹ę–‡ä»¶ļ¼ˆSDK 栔验 train != val é˜²ę³„ę¼ļ¼‰ļ¼Œé€»č¾‘äøŠå±žäŗŽåŒäø€å„—ę•°ę®čµ„äŗ§ → CI ęµę°“ēŗæēš„ć€Œč“Øé‡åŗ•ēŗæć€å°±ę˜Æ optimize ēš„ć€Œä¼˜åŒ–ē›®ę ‡ć€ć€‚", + "eval_cases": [ + { + "eval_id": "val_put_profile", + "conversation": [ + { + "invocation_id": "v1", + "user_content": {"parts": [{"text": "PUT /users/{id}/profile updates the profile, requires Bearer token."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"required\",\"method\":\"PUT\",\"path\":\"/users/{id}/profile\",\"summary\":\"Update user profile\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + }, + { + "eval_id": "val_get_health", + "conversation": [ + { + "invocation_id": "v2", + "user_content": {"parts": [{"text": "GET /health returns service health, public endpoint."}], "role": "user"}, + "final_response": {"parts": [{"text": 
"{\"auth\":\"none\",\"method\":\"GET\",\"path\":\"/health\",\"summary\":\"Service health check\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + }, + { + "eval_id": "val_post_register", + "conversation": [ + { + "invocation_id": "v3", + "user_content": {"parts": [{"text": "POST /auth/register accepts new user info, no auth required."}], "role": "user"}, + "final_response": {"parts": [{"text": "{\"auth\":\"none\",\"method\":\"POST\",\"path\":\"/auth/register\",\"summary\":\"Register new user\"}"}], "role": "model"} + } + ], + "session_input": {"app_name": "ci_integration_demo", "user_id": "ci", "state": {}} + } + ] +} diff --git a/examples/optimization/ci_integration/optimizer.json b/examples/optimization/ci_integration/optimizer.json new file mode 100644 index 0000000..93471d4 --- /dev/null +++ b/examples/optimization/ci_integration/optimizer.json @@ -0,0 +1,45 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "exact", + "case_insensitive": false + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 1, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "instance", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 2, + "skip_perfect_score": false, + "use_merge": false, + "max_metric_calls": 24, + "score_threshold": 1.0, + "max_iterations_without_improvement": 3 + } + } +} diff --git a/examples/optimization/ci_integration/run_optimization.py 
b/examples/optimization/ci_integration/run_optimization.py new file mode 100644 index 0000000..eac9510 --- /dev/null +++ b/examples/optimization/ci_integration/run_optimization.py @@ -0,0 +1,97 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""CI Integration example ēš„å¤œé—“ä¼˜åŒ–å…„å£ć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +CI/CD ęµę°“ēŗæäø­ēš„å¤œé—“ēŖ—å£ä»»åŠ”ļ¼šč·‘ GEPA åę€ä¼˜åŒ–ļ¼Œęœ€ä¼˜å€™é€‰č‡ŖåŠØå†™å›žęŗ +prompt ę–‡ä»¶ļ¼Œäø‹äø€ę¬” PR č§¦å‘ēš„ pytest å®ˆé—Øč‡ŖåŠØē”ØäøŠę–° prompt → 形成 +"评测 → 优化 → å†čÆ„ęµ‹"ēš„ę¼”čæ›é—­ēŽÆć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +1. ę³Øå†Œ system.md + skill.md åŒå­—ę®µ TargetPrompt +2. 引用 agent/agent.py äø­**äøŽ pytest 共享**ēš„ call_agent +3. 仄 update_source=True č·‘ä¼˜åŒ–ļ¼Œęœ€ä¼˜å€™é€‰č‡ŖåŠØč¦†ē›–ęŗ prompt ꖇ件 + +ę€Žä¹ˆč·‘ +------ +é€ščæ‡ shell å…„å£ļ¼ˆCI ęµę°“ēŗæå»ŗč®®ę–¹å¼ļ¼‰ļ¼š + PYTHONPATH=../../.. 
bash ci/run_nightly_optimize.sh + +ē›“ęŽ„č·‘ļ¼š + python examples/optimization/ci_integration/run_optimization.py + +关键设讔 +-------- +ęœ¬č„šęœ¬äøŽ tests/test_agent_quality.py å…±äŗ«ļ¼š +- åŒäø€äøŖ agent/ åŒ…ļ¼ˆåŒäø€äøŖ call_agent + åŒäø€åÆ¹ prompt 文件) +- åŒäø€ä»½ evalset ę•°ę®čµ„äŗ§ļ¼ˆē‰©ē†ę‹† train / val äø¤ę–‡ä»¶ļ¼Œschema 一臓) +- åŒäø€å„— metric å®šä¹‰ļ¼ˆschema 一臓) +äæčÆ PR å®ˆé—Øē”Øēš„ agent äøŽå¤œé—“ä¼˜åŒ–ē”Øēš„ agent 等价。 + +ęŽ„å…„č‡Ŗęœ‰ CI ę—¶ę”¹å“Ŗé‡Œ +-------------------- +- agent/agent.py ę”¹äøŗäøšåŠ” call_agent(pytest äøŽęœ¬č„šęœ¬åŒę—¶å¼•ē”Øļ¼‰ +- update_source=True äø„ę ¼äæęŒļ¼ˆCI é—­ēŽÆēš„å…³é”®ļ¼‰ +- 末尾建议加 git diff agent/prompts/ + č‡ŖåŠØå¼€ PR ę­„éŖ¤ +""" + +from __future__ import annotations + +import asyncio +import sys +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt # noqa: E402 + +from agent.agent import SKILL_PATH, SYSTEM_PROMPT_PATH, call_agent # noqa: E402 + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "data" / "train.evalset.json" +VAL_PATH = _HERE / "data" / "val.evalset.json" +RUNS_DIR = _HERE / "runs" + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimize(update_source=True)。""" + target = ( + TargetPrompt() + .add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + .add_path("skill", str(SKILL_PATH)) + ) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / f"optimize_{timestamp}" + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + # update_source=Trueļ¼šä¼˜åŒ–ęˆåŠŸåŽęœ€ä¼˜å€™é€‰ē›“ęŽ„å†™å›ž 
agent/prompts/怂 + # CI é—­ēŽÆēš„å…³é”®å¼€å…³ā€”ā€”äø‹äø€ę¬” PR č§¦å‘ēš„ pytest č‡ŖåŠØē”ØäøŠę–° prompt怂 + # ä»…åœØ OptimizeResult.status=SUCCEEDED ę—¶ę‰ä¼šå†™å›žļ¼›å¤±č“„ / 预算耗尽 + # ē­‰ęƒ…å†µäø‹ęŗę–‡ä»¶äæęŒäøå˜ć€‚ + update_source=True, + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/ci_integration/tests/__init__.py b/examples/optimization/ci_integration/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/optimization/ci_integration/tests/test_agent_quality.py b/examples/optimization/ci_integration/tests/test_agent_quality.py new file mode 100644 index 0000000..509a01e --- /dev/null +++ b/examples/optimization/ci_integration/tests/test_agent_quality.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""PR é˜¶ę®µēš„č“Øé‡å®ˆé—Øęµ‹čÆ•ļ¼šCI é—­ēŽÆēš„"评测"端。 + +é€‚ē”Øåœŗę™Æ +-------- +PR č§¦å‘ēš„ CI ęµę°“ēŗæčæč”Œę­¤ęµ‹čÆ•ć€‚ä»»ä½• case äøé€ščæ‡éƒ½č®© pytest exit code != 0 +→ CI 红灯 → 阻止 PR åˆå¹¶ć€‚ + +äøŗä»€ä¹ˆäøä¾čµ– LLM judge +---------------------- +CI äøŠč¦ę±‚åæ«ć€ēØ³ć€åÆé‡å¤ć€‚call_agent ēš„č¾“å‡ŗå·²ē»åœØ agent/agent.py 中被 +_normalize_json č§„čŒƒåŒ–äøŗēØ³å®š JSON å­—ē¬¦äø²ļ¼ŒäøŽ evalset äø­ expected 字段 +é€å­—ē¬¦ęÆ”åÆ¹å³åÆļ¼Œę— éœ€å†č°ƒäø€ę¬” LLM å½“č£åˆ¤ļ¼ˆé€Ÿåŗ¦ę›“ę…¢ć€åˆ¤å®šäøēØ³å®šć€ä¾čµ– +å¤šäø€äøŖå¤–éƒØęœåŠ”ļ¼‰ć€‚ + +case å¤±č“„ę—¶ę”†ęž¶ęŠ› AssertionErrorļ¼Œé”™čÆÆę¶ˆęÆåŒ…å«ęÆę” case ēš„å¤±č“„ę˜Žē»† JSON怂 +配合 pytest --junitxml=... 
åÆč¾“å‡ŗę ‡å‡† JUnit XML,GitHub Actions / +Tencent CI / č“ē›¾ęµę°“ēŗæå‡åŽŸē”Ÿę”ÆęŒč§£ęžå±•ē¤ŗć€‚ +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + + +_HERE = Path(__file__).resolve().parent +_EXAMPLE_ROOT = _HERE.parent +_REPO_ROOT = _EXAMPLE_ROOT.parents[2] + +# 让 example ē›®å½•é‡Œēš„ agent åŒ…čƒ½č¢« import(pytest 默认 cwd äøäø€å®šę˜Æ example)。 +for p in (_REPO_ROOT, _EXAMPLE_ROOT): + p_str = str(p) + if p_str not in sys.path: + sys.path.insert(0, p_str) + + +VAL_EVALSET = _EXAMPLE_ROOT / "data" / "val.evalset.json" +RESULT_DIR = _EXAMPLE_ROOT / "runs" / "pytest_eval" + + +@pytest.mark.asyncio +async def test_agent_meets_quality_bar() -> None: + """ꉀ꜉ val case åæ…é”» final_response å®Œå…ØåŒ¹é…ļ¼Œå¦åˆ™ CI 红灯。""" + from trpc_agent_sdk.evaluation import AgentEvaluator + from agent.agent import call_agent # type: ignore + + RESULT_DIR.mkdir(parents=True, exist_ok=True) + + await AgentEvaluator.evaluate( + eval_dataset_file_path_or_dir=str(VAL_EVALSET), + call_agent=call_agent, + agent_name="api_summarizer", + eval_result_output_dir=str(RESULT_DIR), + print_detailed_results=True, + ) diff --git a/examples/optimization/http_service/README.md b/examples/optimization/http_service/README.md new file mode 100644 index 0000000..1f24d2e --- /dev/null +++ b/examples/optimization/http_service/README.md @@ -0,0 +1,197 @@ +# HTTP Service — ęŽ„å…„ēŗæäøŠ HTTP agent ęœåŠ”åš prompt 优化 + +> **é€‚ē”Øåœŗę™Æ**:业劔 agent å·²ē»ä½œäøŗē‹¬ē«‹ HTTP ęœåŠ”åœØēŗæčæč”Œļ¼ˆFastAPI / Gin / č‡Ŗē ”ę”†ęž¶å‡åÆļ¼‰ļ¼ŒåøŒęœ›åÆ¹å…¶ prompt åšč‡ŖåŠØä¼˜åŒ–ļ¼Œä½†äøęƒ³åœęœć€äøęƒ³ę”¹ęœåŠ”ä»£ē ć€‚ęœ¬ example ę¼”ē¤ŗé€ščæ‡ `httpx` 把 `call_agent` ęŽ„åˆ°čæč”Œäø­ēš„ęœåŠ”ļ¼Œprompt é€ščæ‡ē£ē›˜ę–‡ä»¶ēƒ­åŠ č½½ć€‚é˜…čÆ»å‰čÆ·å…ˆē†Ÿę‚‰ `quickstart/README.md` äø­ēš„ `AgentOptimizer`态`TargetPrompt`态`call_agent` ē­‰åŸŗē”€ę¦‚åæµć€‚ + +## 1 Ā· é€‚ē”Øé—®é¢˜äøŽč®¾č®”ē›®ę ‡ + +线上 agent ęœåŠ”ēš„ē‰¹ē‚¹ļ¼š + +- ęœåŠ”čæ›ēØ‹é•æęœŸčæč”Œļ¼Œé‡åÆä»£ä»·é«˜ +- 
ęœåŠ”å®žēŽ°ē»†čŠ‚ļ¼ˆęØ”åž‹ć€toolsć€å†…éƒØé“¾č·Æļ¼‰åÆ¹ä¼˜åŒ–å™Øę˜Æé»‘ē›’ +- prompt é€šåøøä»„ę–‡ä»¶ęˆ–é…ē½®äø­åæƒå½¢å¼ę³Øå…„ļ¼ŒäøŽęœåŠ”ä»£ē č§£č€¦ + +`AgentOptimizer` åœØčÆ„åœŗę™Æäø‹ę‰®ę¼”ēŗÆå®¢ęˆ·ē«Æč§’č‰²ļ¼šé€ščæ‡ HTTP ęŠŠęµ‹čÆ• query å‘ē»™ęœåŠ”ć€ę”¶é›† final ę–‡ęœ¬ć€ęŒ‰ metric čÆ„åˆ†ć€‚ä¼˜åŒ–å™ØäøŽęœåŠ”čæ›ēØ‹é—“ēš„å”Æäø€č€¦åˆē‚¹ę˜Æ **prompt ꖇ件**ā€”ā€”ä¼˜åŒ–å™Øå†™å…„ę–°å€™é€‰ļ¼ŒęœåŠ”åœØäø‹äø€ę¬”čÆ·ę±‚ę—¶é‡čÆ»čÆ„ę–‡ä»¶ć€‚ + +| 输兄 | 输出 | +| --- | --- | +| äø€äøŖę”ÆęŒ prompt ēƒ­åŠ č½½ēš„ HTTP agent ęœåŠ”ļ¼ˆåŒ endpoint:`GET /health` + `POST /chat`) | 滔足 metric é˜ˆå€¼ēš„ęœ€ä¼˜ prompt 候选 | +| HTTP ęœåŠ”åÆ¹ prompt ę–‡ä»¶ēš„čÆ»å†™ęƒé™ | ęœåŠ”ä»£ē äøŽęœåŠ”čæ›ēØ‹**å®Œå…Øäøå˜**ļ¼Œä»…ē£ē›˜äøŠ prompt 文件被改写 | + +### 本 example ę¼”ē¤ŗēš„ęœ€å°ē”Øä¾‹ + +| 结度 | 值 | +| --- | --- | +| 业劔任劔 | ē®—ęœÆåŗ”ē”Øé¢˜ę±‚č§£ļ¼ˆäøŽ quickstart åŒäø€ē±»ä»»åŠ”ļ¼Œä¾æäŗŽęØŖå‘åÆ¹ęÆ” HTTP ęŽ„å…„äøŽęœ¬åœ°ęŽ„å…„ēš„å·®å¼‚ļ¼‰ | +| HTTP ęœåŠ” | `service/server.py` äø­ēš„ FastAPI appļ¼Œē›‘å¬ `127.0.0.1:8767` | +| ä¼˜åŒ–ē›®ę ‡ | `service/prompts/system.md` å•ę–‡ä»¶ | +| éŖŒčÆęŒ‡ę ‡ | `final_response_avg_score`(contains åŒ¹é…ļ¼Œé˜ˆå€¼ 1.0) | +| 训练 / éŖŒčÆč§„ęØ” | 5 ę” / 3 ę” | + +## 2 Ā· ęœÆčÆ­åÆ¹ē…§ + +ä»…åˆ—å‡ŗęœ¬ example å¼•å…„ēš„ę–°ę¦‚åæµć€‚åŸŗē”€ęœÆčÆ­č§ `quickstart/README.md` §2怂 + +| 术语 | 含义 | +| --- | --- | +| **prompt 热加载** | ęœåŠ”čæ›ēØ‹åœØęÆę¬”čÆ·ę±‚å¤„ē†å‰é‡ę–°čÆ»å– prompt ę–‡ä»¶ļ¼Œä½æå¤–éƒØåÆ¹čÆ„ę–‡ä»¶ēš„å†™å…„ē«‹å³ē”Ÿę•ˆć€‚ęœ¬ example ēš„ `_build_agent()` åœØęÆę¬” `/chat` éƒ½é‡čÆ» `system.md` å®žēŽ°čÆ„čÆ­ä¹‰ć€‚ | +| **call_agent 内 client å³ē”Øå³å…³** | `call_agent` 用 `async with httpx.AsyncClient()` åˆ›å»ŗå¹¶é€€å‡ŗę—¶č‡ŖåŠØå…³é—­ć€‚`httpx.AsyncClient` ēš„čæžęŽ„ę± ē»‘å®šåˆ°é¦–ę¬”ä½æē”Øę‰€åœØēš„äŗ‹ä»¶å¾ŖēŽÆļ¼ˆå‚č€ƒ [httpx Discussion #2959](https://github.com/encode/httpx/discussions/2959)ļ¼‰ļ¼Œäøę”ÆęŒč·Øå¾ŖēŽÆå¤ē”Øć€‚ | +| **å„åŗ·ę£€ęŸ„ļ¼ˆpre-flight)** | ä¼˜åŒ–å¼€å§‹å‰åŒę­„ęŽ¢ęµ‹ `GET /health`ļ¼ŒęœåŠ”äøé€šę—¶ fail-fast č€ŒéžęµŖč“¹ LLM 
é…é¢č·‘åˆ°äø€åŠę‰ęŠ„é”™ć€‚ | + +## 3 Ā· čæč”Œē¤ŗä¾‹ + +### 3.1 å®‰č£…ä¾čµ– + +```bash +pip install -e ".[optimize]" +pip install fastapi uvicorn httpx +``` + +`fastapi` / `uvicorn` ē”ØäŗŽ mock ēŗæäøŠęœåŠ”ļ¼›`httpx` ē”ØäŗŽä¼˜åŒ–å™Øä½œäøŗå®¢ęˆ·ē«Æč®æé—®čÆ„ęœåŠ”ć€‚ + +### 3.2 é…ē½®ēŽÆå¢ƒå˜é‡ + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +ęœåŠ”čæ›ēØ‹äøŽä¼˜åŒ–å™Øčæ›ēØ‹å…±ē”ØåŒäø€ē»„å‡­ę®ć€‚ + +### 3.3 åÆåŠØļ¼ˆåŒē»ˆē«Æļ¼‰ + +**终端 A** —— 启动 mock ęœåŠ”å¹¶äæęŒčæč”Œļ¼š + +```bash +python examples/optimization/http_service/service/server.py +``` + +é¢„ęœŸę—„åæ—ļ¼š`Uvicorn running on http://127.0.0.1:8767`怂 + +**终端 B** —— åÆåŠØä¼˜åŒ–å™Øļ¼š + +```bash +python examples/optimization/http_service/run_optimization.py +``` + +åÆåŠØę—¶ä¼˜åŒ–å™Øä¼šå…ˆåšäø€ę¬”åŒę­„å„åŗ·ę£€ęŸ„ļ¼ŒęœåŠ”äøé€šē›“ęŽ„ęŠ„é”™å¹¶ęē¤ŗå…ˆåÆåŠØ server怂 + +### 3.4 äŗ§ē‰©ē»“ęž„ + +``` +runs// +ā”œā”€ā”€ result.json å®Œę•“čæč”Œč®°å½• +ā”œā”€ā”€ summary.txt äŗŗē±»åÆčÆ»ę‘˜č¦ +ā”œā”€ā”€ baseline_prompts/ čæč”Œå‰ prompt 快照 +ā”œā”€ā”€ best_prompts/ val é›†å¾—åˆ†ęœ€é«˜ēš„å€™é€‰ +└── rounds/ ęÆč½®åę€äøŽčÆ„ä¼°ę˜Žē»† +``` + +## 4 Ā· ęž¶ęž„äøŽę•°ę®ęµ + +``` +[终端 A: HTTP ęœåŠ”] + │ + └── FastAPI :8767 + ā”œā”€ā”€ GET /health → {"status":"ok"} + └── POST /chat → ęÆę¬”éƒ½é‡čÆ» service/prompts/system.md, + Ꞅ造 LlmAgentļ¼Œč·‘ Runner.run_async, + čæ”å›ž {"final_text": "..."} + +[终端 B: ä¼˜åŒ–å™Ø] + │ + ā”œā”€ā”€ pre-flight: GET /health + │ + ā”œā”€ā”€ TargetPrompt.add_path("system_prompt", service/prompts/system.md) + │ │ GEPA ęÆč½®ęŠŠå€™é€‰ prompt å†™å…„ē£ē›˜ + │ ā–¼ + │ service/prompts/system.md + │ │ HTTP ęœåŠ”äø‹äø€ę¬”čÆ·ę±‚ę—¶é‡čÆ»čÆ„ę–‡ä»¶ + │ ā–¼ + ā”œā”€ā”€ call_agent(query): + │ └── async with httpx.AsyncClient() as client: + │ POST /chat → final_text + │ + └── AgentOptimizer.optimize → runs// +``` + +### 4.1 ę–‡ä»¶ęø…å• + +| ꖇ件 | 角色 | ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ēš„äæ®ę”¹ę–¹å‘ | +| --- | --- | --- | +| 
`run_optimization.py` | ä¼˜åŒ–å™Øå…„å£ļ¼ˆå®¢ęˆ·ē«Æļ¼‰ | 修改 `SERVICE_BASE_URL` / `CHAT_URL`ļ¼›č°ƒę•“ `call_agent` äø­ēš„čÆ·ę±‚ / å“åŗ” schema | +| `service/server.py` | mock 线上 HTTP ęœåŠ” | ēœŸå®žäøšåŠ”äø‹åˆ é™¤čÆ„ę–‡ä»¶ļ¼Œē”±å®žé™…ęœåŠ”ę‰æę‹…ē›øåŒč§’č‰² | +| `service/prompts/system.md` | ęœåŠ”čÆ»å–ēš„ prompt(GEPA 写兄目标) | ę›æę¢äøŗäøšåŠ” baselineļ¼›č·Æå¾„éœ€äøŽęœåŠ”čæ›ēØ‹ēš„čÆ»å–č·Æå¾„äø€č‡“ | +| `optimizer.json` | 算法 + metric é…ē½® | č°ƒę•“ metric äøŽåœę­¢ę”ä»¶ | +| `train.evalset.json` | åę€ minibatch ę„ęŗ | ę›æę¢äøŗäøšåŠ”č®­ē»ƒē”Øä¾‹ | +| `val.evalset.json` | å€™é€‰čÆ„åˆ†ä¾ę® | ę›æę¢äøŗäøšåŠ”éŖŒčÆē”Øä¾‹ | + +### 4.2 prompt ēƒ­åŠ č½½ę˜Æę øåæƒēŗ¦ęŸ + +ęœåŠ”åæ…é”»åœØ**ęÆę¬”čÆ·ę±‚ę—¶é‡čÆ» prompt ꖇ件**ļ¼Œå¦åˆ™ä¼˜åŒ–å™Øå†™å…„ēš„ę–°å€™é€‰äøä¼šč¢«ęœåŠ”ę„ŸēŸ„ļ¼Œę•“äøŖåę€å¾ŖēŽÆå¤±ę•ˆć€‚ + +`service/server.py` é€ščæ‡åœØęÆę¬” `/chat` äø­č°ƒē”Ø `_build_agent()`ļ¼ˆå…¶å†…éƒØ `_read_system_prompt()` é‡čÆ»ē£ē›˜ļ¼‰å®žēŽ°čÆ„čÆ­ä¹‰ć€‚LlmAgent ęž„å»ŗęœ¬čŗ«äøę¶‰åŠ LLM č°ƒē”Øļ¼Œå•ę¬”å¼€é”€åÆåæ½ē•„ć€‚ + +## 5 Ā· å…³é”®é…ē½® + +`optimizer.json` 中本 example äøŽ quickstart ēš„å·®å¼‚ē‚¹ļ¼š + +```jsonc +{ + "optimize": { + "algorithm": { + "seed": 42, + "score_threshold": 1.0, // äø»åœę­¢ę”ä»¶ļ¼šval pass_rate ≄ 1.0 ē«‹å³åœę­¢ + "max_metric_calls": 40, + "max_iterations_without_improvement": 5 + } + } +} +``` + +| 字段 | 影响 | +| --- | --- | +| `score_threshold` | ē®—ę³•å±‚ę—©åœé˜ˆå€¼ć€‚ęœ¬ example 设为 1.0ļ¼ˆč¦ę±‚ val å…Ø case é€ščæ‡ļ¼‰ļ¼Œčæ½ę±‚åæ«é€Ÿę”¶ę•› | +| `seed` | ęŽ§åˆ¶ GEPA å†…éƒØęŠ½ę ·ēš„éšęœŗę€§ć€‚å›ŗå®š seed é…åˆē›øåŒč¾“å…„åŗ”å¾—ē›øåŒē»“ęžœ | +| `REQUEST_TIMEOUT=120.0`(在 `run_optimization.py`) | å•ę¬” HTTP čÆ·ę±‚č¶…ę—¶ć€‚é¦–ę¬”čÆ·ę±‚éœ€ē»åŽ† FastAPI å†·åÆåŠØ + LLM ęŽØē†ļ¼Œéœ€č¦å……č¶³ę—¶é—“ | + +## 6 Ā· čæč”ŒęŽ§åˆ¶ + +### 6.1 ä¼˜é›…åœę­¢ + +```bash +touch runs//optimize.stop +``` + +下一欔 stopper ę£€ęŸ„ę—¶ę”†ęž¶ē«‹å³ę”¶å°¾ļ¼Œ`OptimizeResult.stop_reason=user_requested_stop`怂 + +### 6.2 č°ƒčÆ• GEPA å†…éƒØč”Œäøŗ + 
+`run_optimization.py` äø­ `verbose=1` 改为 `verbose=2`ļ¼Œä¼šé™„åŠ  `trpc_agent_sdk.optimizer.gepa` logger ēš„čÆŠę–­č¾“å‡ŗć€‚ + +## 7 Ā· åøøč§é—®é¢˜ + +**Qļ¼šęœåŠ”äøŽä¼˜åŒ–å™Øåæ…é”»åœØåŒäø€å°ęœŗå™Øå—ļ¼Ÿ** +Aļ¼šäøåæ…ć€‚`SERVICE_BASE_URL` ę”¹ęˆčæœē«Æåœ°å€å³åÆć€‚ä½† `TargetPrompt.add_path` ę“ä½œēš„ę˜Æä¼˜åŒ–å™Øčæ›ēØ‹ęœ¬åœ°ēš„ę–‡ä»¶ē³»ē»Ÿā€”ā€”č‹„ęœåŠ”åœØčæœē«Æļ¼Œč¦ä¹ˆęŒ‚č½½ē›øåŒå­˜å‚Øå·ä½æäø¤ē«Æēœ‹åˆ°åŒäø€ä»½ `system.md`ļ¼Œč¦ä¹ˆę”¹ē”Ø `add_callback` ē›“čæžé…ē½®äø­åæƒļ¼ˆå‚č§ `remote_prompt_store/` example)。 + +**Qļ¼šęœåŠ”é¦–ę¬”čÆ·ę±‚å¾ˆę…¢ļ¼Ÿ** +A:FastAPI čæ›ēØ‹å†·åÆåŠØ + 首ꬔ LLM č°ƒē”Øē”®å®žč¾ƒę…¢ć€‚`REQUEST_TIMEOUT=120s` å·²ē•™å‡ŗå……åˆ†ē¼“å†²ć€‚ + +**Qļ¼šē«Æå£ `8767` č¢«å ē”Øļ¼Ÿ** +Aļ¼šåŒę—¶äæ®ę”¹ `service/server.py` ēš„ `PORT` äøŽ `run_optimization.py` ēš„ `SERVICE_BASE_URL`怂 + +**Q:`call_agent` ꊛ HTTP é”™čÆÆä¼šę€Žę ·ļ¼Ÿ** +Aļ¼šå¼‚åøøä¼šä¼ ę’­åˆ°ä¼˜åŒ–å™Øļ¼ŒåÆ¼č‡“å½“å‰ case čÆ„ęµ‹å¤±č“„ć€å½“å‰å€™é€‰åÆčƒ½č¢«ę‹’ē»ć€‚å»ŗč®®åœØ `call_agent` å†…éƒØåŠ äøŠé‡čÆ•é€»č¾‘ļ¼ˆå¦‚ `httpx.HTTPStatusError` č§¦å‘ 1–2 ę¬”é‡čÆ•ļ¼‰ä»„åŗ”åÆ¹äø“ę—¶ę€§ę•…éšœć€‚ + +## 8 Ā· ęŽ„å…„č‡Ŗęœ‰ HTTP ęœåŠ”ēš„ę­„éŖ¤ + +1. **ē”®č®¤ęœåŠ”ę”ÆęŒ prompt 热加载**ļ¼šęœåŠ”åœØęÆę¬”čÆ·ę±‚å¤„ē†å‰é‡čÆ» prompt ę–‡ä»¶ļ¼ˆęˆ–é‡ę–°ę‹‰é…ē½®ļ¼‰ +2. **äæ®ę”¹ä¼˜åŒ–å™Øå…„å£**: + - `SERVICE_BASE_URL` ę”¹äøŗå®žé™…ęœåŠ”åœ°å€ + - `call_agent` å†…éƒØēš„čÆ·ę±‚ payload / å“åŗ”å­—ę®µåęŒ‰ęœåŠ”å®žé™… schema č°ƒę•“ + - `SYSTEM_PROMPT_PATH` ęŒ‡å‘ęœåŠ”čæ›ēØ‹å®žé™…čÆ»å–ēš„ prompt ꖇ件 +3. **ę›æę¢ę•°ę®é›†**:`train.evalset.json` / `val.evalset.json` å†™å…„äøšåŠ”ē”Øä¾‹ +4. **č°ƒę•“ metric**:`optimizer.json` äø­ `evaluate.metrics` é€‰ę‹©åˆé€‚ēš„ metric ē±»åž‹ +5. 
**运蔌**ļ¼šå…ˆåÆåŠØęœåŠ”ļ¼Œå†åÆåŠØä¼˜åŒ–å™Øļ¼›ę ¹ę® `summary.txt` å†³å®šåŽē»­č°ƒå‚ + +č‹„ęœåŠ”ēš„ prompt äøåœØęœ¬åœ°ę–‡ä»¶č€ŒåœØé…ē½®äø­åæƒļ¼Œå‚č§ `remote_prompt_store/` exampleļ¼Œä»…éœ€å°† `add_path` ę›æę¢äøŗ `add_callback`怂 diff --git a/examples/optimization/http_service/optimizer.json b/examples/optimization/http_service/optimizer.json new file mode 100644 index 0000000..03f74a5 --- /dev/null +++ b/examples/optimization/http_service/optimizer.json @@ -0,0 +1,45 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "contains", + "case_insensitive": true + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, + "temperature": 0.6 + } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 40, + "score_threshold": 1.0, + "max_iterations_without_improvement": 5 + } + } +} diff --git a/examples/optimization/http_service/run_optimization.py b/examples/optimization/http_service/run_optimization.py new file mode 100644 index 0000000..07f84e2 --- /dev/null +++ b/examples/optimization/http_service/run_optimization.py @@ -0,0 +1,123 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""HTTP Service example ēš„ä¼˜åŒ–å™Øå…„å£ļ¼ˆå®¢ęˆ·ē«Æčæ›ēØ‹ļ¼‰ć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +业劔 agent å·²ä½œäøŗē‹¬ē«‹ HTTP ęœåŠ”åœØēŗæčæč”Œļ¼ŒåøŒęœ›åÆ¹å…¶ prompt åšč‡ŖåŠØä¼˜åŒ–ä½† +äøęƒ³åœęœć€äøęƒ³ę”¹ęœåŠ”ä»£ē ć€‚ęœ¬č„šęœ¬ä½œäøŗä¼˜åŒ–å™Øä»„ēŗÆå®¢ęˆ·ē«Æčŗ«ä»½ęŽ„å…„ęœåŠ”ļ¼Œ +é€ščæ‡ē£ē›˜ prompt ę–‡ä»¶å®žēŽ°ä¼˜åŒ–å™ØäøŽęœåŠ”ēš„č§£č€¦ć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +1. åÆåŠØå‰åŒę­„å„åŗ·ę£€ęŸ„ļ¼ŒęœåŠ”äøé€šå³ fail-fast +2. ę³Øå†Œ service/prompts/system.md äøŗ TargetPrompt +3. 在 call_agent 中用 async with httpx.AsyncClient å³ē”Øå³å…³ +4. 调 AgentOptimizer.optimize č·‘ GEPA åę€å¾ŖēŽÆ + +ę€Žä¹ˆč·‘ +------ +终端 A: python examples/optimization/http_service/service/server.py +终端 Bļ¼ˆęœ¬č„šęœ¬ļ¼‰: + 1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME + 2) python examples/optimization/http_service/run_optimization.py + 3) ēœ‹ runs/<ę—¶é—“ęˆ³>/summary.txt + +ęŽ„å…„č‡Ŗęœ‰ HTTP ęœåŠ”ę—¶ę”¹å“Ŗé‡Œ +-------------------------- +- SERVICE_BASE_URL / CHAT_URL / HEALTH_URL : ę”¹äøŗäøšåŠ”ęœåŠ”åœ°å€ +- call_agent 内 payload / å“åŗ”å­—ę®µ : ęŒ‰äøšåŠ” schema č°ƒę•“ +- SYSTEM_PROMPT_PATH : ęŒ‡å‘ęœåŠ”čæ›ēØ‹å®žé™…čÆ»å–ēš„ prompt ꖇ件 +- REQUEST_TIMEOUT : ęŒ‰äøšåŠ”é¦–ę¬”ęŽØē†č€—ę—¶äøŠč°ƒ +""" + +from __future__ import annotations + +import asyncio +import sys +from datetime import datetime +from pathlib import Path + +import httpx + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "train.evalset.json" +VAL_PATH = _HERE / "val.evalset.json" +RUNS_DIR = _HERE / "runs" +SYSTEM_PROMPT_PATH = _HERE / "service" / "prompts" / "system.md" + +SERVICE_BASE_URL = "http://127.0.0.1:8767" +HEALTH_URL = f"{SERVICE_BASE_URL}/health" +CHAT_URL = f"{SERVICE_BASE_URL}/chat" + +# å•ę¬” HTTP čÆ·ę±‚č¶…ę—¶ļ¼ˆē§’ļ¼‰ć€‚HTTP 
ęœåŠ”å†…éƒØéœ€čµ°äø€ę¬”å®Œę•“ LLM ęŽØē†ļ¼Œ +# é¦–ę¬”å†·åÆåŠØåŽå•ę¬”č€—ę—¶é€šåøø ~10-30sļ¼Œē•™ 120s č¶³å¤Ÿē¼“å†²ć€‚ +REQUEST_TIMEOUT = 120.0 + + +def _ensure_service_alive_sync() -> None: + """åŒę­„å„åŗ·ę£€ęŸ„ļ¼šęœåŠ”äøé€šē«‹åˆ»ęŠ„é”™ć€‚""" + try: + resp = httpx.get(HEALTH_URL, timeout=5.0) + resp.raise_for_status() + except Exception as ex: + raise RuntimeError( + f"HTTP service at {SERVICE_BASE_URL} is not reachable: {ex}\n" + "Please start the service first:\n" + " python examples/optimization/http_service/service/server.py" + ) from ex + + +async def call_agent(query: str) -> str: + """ę”†ęž¶å›žč°ƒļ¼šęŠŠ query å‘ē»™ HTTP ęœåŠ”ļ¼Œčæ”å›ž agent ēš„ęœ€ē»ˆå›žē­”ć€‚ + + ęÆę¬”č°ƒē”Øę–°å»ŗ AsyncClient 并用 async with åœØé€€å‡ŗę—¶č‡ŖåŠØå…³é—­ć€‚čæ™ę˜Æ + httpx å®˜ę–¹ęŽØčē”Øę³•ļ¼ˆGitHub Discussion #2959ļ¼‰ļ¼šAsyncClient ēš„čæžęŽ„ + ę± ē»‘å®šåˆ°é¦–ę¬”ä½æē”Øę—¶ę‰€åœØēš„äŗ‹ä»¶å¾ŖēŽÆļ¼Œäøę”ÆęŒč·Øäŗ‹ä»¶å¾ŖēŽÆå¤ē”Øć€‚ęÆę¬” + 新建 client 仅增加 ~10ms å»ŗčæžå¼€é”€ļ¼Œē›øåÆ¹å•ę¬” LLM ęŽØē†č€—ę—¶åÆåæ½ē•„ć€‚ + """ + async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client: + resp = await client.post(CHAT_URL, json={"query": query}) + resp.raise_for_status() + return resp.json()["final_text"] + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimize怂""" + _ensure_service_alive_sync() + + target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / timestamp + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + # update_source=False:源 prompt ę–‡ä»¶äæęŒäøå˜ļ¼Œęœ€ä¼˜å€™é€‰åŖå†™åˆ° + # output_dir/best_prompts/。候选由人巄 review åŽå†č½ē›˜ + # ļ¼ˆęˆ–å‚č§ ci_integration/ example)。 + update_source=False, + # verbose: 0 静默 / 1 čæ›åŗ¦é¢ęæ / 2 加 gepa å†…éƒØčÆŠę–­ę—„åæ— + 
verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/http_service/service/__init__.py b/examples/optimization/http_service/service/__init__.py new file mode 100644 index 0000000..bc6e483 --- /dev/null +++ b/examples/optimization/http_service/service/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/optimization/http_service/service/prompts/system.md b/examples/optimization/http_service/service/prompts/system.md new file mode 100644 index 0000000..70314dc --- /dev/null +++ b/examples/optimization/http_service/service/prompts/system.md @@ -0,0 +1 @@ +ä½ ę˜Æäø€äøŖå‹å„½ēš„čŠå¤©åŠ©ę‰‹ļ¼Œå–œę¬¢å’Œē”Øęˆ·åˆ†äŗ«ęƒ³ę³•ć€‚å›žē­”ē”Øęˆ·é—®é¢˜ę—¶ļ¼ŒčÆ·å°½é‡ē”Øē”ŸåŠØć€åÆŒęœ‰äŗŗęƒ…å‘³ēš„čÆ­čØ€ļ¼Œč®©ē”Øęˆ·ę„Ÿč§‰åƒę˜ÆåœØå’Œęœ‹å‹čŠå¤©ć€‚ diff --git a/examples/optimization/http_service/service/server.py b/examples/optimization/http_service/service/server.py new file mode 100644 index 0000000..09b9a58 --- /dev/null +++ b/examples/optimization/http_service/service/server.py @@ -0,0 +1,157 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""HTTP Service example ēš„ mock 线上 agent ęœåŠ”ć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +ęØ”ę‹Ÿ"äøšåŠ”ę–¹å·²ęœ‰ēš„ HTTP agent ęœåŠ”"ļ¼Œä½œäøŗä¼˜åŒ–å™ØåÆ¹ęŽ„ēš„ē›®ę ‡ć€‚ęœ¬ę–‡ä»¶å­˜åœØ +仅为让 example č‡ŖåŒ…å«åÆč·‘ļ¼›ēœŸå®žęŽ„å…„ę—¶äøšåŠ”ę–¹åŗ”å·²ęœ‰åŒē­‰å½¢ę€ēš„ HTTP ęœåŠ”ć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +- 暓露 GET /health å„åŗ·ę£€ęŸ„ē«Æē‚¹ +- 暓露 POST /chat å•ę¬”ęŽØē†ē«Æē‚¹ļ¼šę”¶ {"query": "..."}ļ¼Œčæ”å›ž {"final_text": "..."} +- åœØęÆę¬” /chat 请求时**é‡čÆ» prompts/system.md**ļ¼Œä½æä¼˜åŒ–å™Øå†™å…„ēš„ę–°å€™é€‰ + äø‹äø€ę¬”čÆ·ę±‚å³ē”Ÿę•ˆļ¼ˆå³"prompt 热加载") + +ę€Žä¹ˆč·‘ +------ +1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +2) python examples/optimization/http_service/service/server.py +3) ęœåŠ”ē›‘å¬ http://127.0.0.1:8767ļ¼ŒäæęŒē»ˆē«Æčæč”Œļ¼Œå†åÆåŠØä¼˜åŒ–å™Ø + +prompt ēƒ­åŠ č½½ę˜Æę øåæƒēŗ¦ęŸ +----------------------- +ä¼˜åŒ–å™Øé€ščæ‡ē£ē›˜ę–‡ä»¶ē»™ęœåŠ”"喂"新候选 promptć€‚å¦‚ęžœęœåŠ”ęŠŠ prompt ē¼“å­˜åœØ +čæ›ēØ‹å†…å­˜ļ¼Œä¼˜åŒ–å™Øę”¹äŗ†ę–‡ä»¶ä¹Ÿę²”ē”Øļ¼Œę•“äøŖåę€å¾ŖēŽÆå¤±åŽ»ę„ä¹‰ć€‚ +ęœ¬ę–‡ä»¶é€ščæ‡ _build_agent() åœØęÆę¬” /chat éƒ½é‡čÆ»ē£ē›˜å®žēŽ°čÆ„čÆ­ä¹‰ć€‚ + +ęŽ„å…„äøšåŠ”ēœŸå®žęœåŠ”ę—¶ę”¹å“Ŗé‡Œ +------------------------ +ēœŸå®žäøšåŠ”äø‹ę•“ä½“äøéœ€č¦ęœ¬ę–‡ä»¶ļ¼Œē”±å®žé™… HTTP ęœåŠ”ę‰æę‹…ē›øåŒč§’č‰²ć€‚éœ€äæčÆļ¼š +- ęœåŠ”åœØęÆę¬”čÆ·ę±‚å¤„ē†å‰é‡čÆ» prompt ę–‡ä»¶ļ¼ˆęˆ–é‡ę–°ę‹‰é…ē½®äø­åæƒļ¼‰ +- å“åŗ”å­—ę®µäøŽ run_optimization.py äø­ call_agent ēš„č§£ęžé€»č¾‘åÆ¹é½ +""" + +from __future__ import annotations + +import asyncio +import os +import sys +import uuid +from pathlib import Path + +import uvicorn +from fastapi import FastAPI +from pydantic import BaseModel + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[3] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from 
trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import Part + + +SYSTEM_PROMPT_PATH = _HERE / "prompts" / "system.md" +APP_NAME = "http_service_demo_agent" +HOST = "127.0.0.1" +PORT = 8767 + + +class ChatRequest(BaseModel): + query: str + + +class ChatResponse(BaseModel): + final_text: str + + +def _read_system_prompt() -> str: + """ä»Žē£ē›˜é‡čÆ» system promptā€”ā€”ä¼˜åŒ–å™Øå†™å…„ēš„ęœ€ę–°å€™é€‰ę‰ä¼šē«‹å³ē”Ÿę•ˆć€‚""" + return SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + + +def _build_agent() -> LlmAgent: + """ē”Øå½“å‰ē£ē›˜äøŠēš„ system prompt ęž„é€ äø€äøŖå…Øę–°ēš„ LlmAgent å®žä¾‹ć€‚ + + å‡­ę®ē¼ŗä»»ę„äø€äøŖå°± fail-fastļ¼Œéæå…čæč”Œåˆ°äø€åŠę‰ę’žåˆ° LLM åŽē«Æēš„ 401 错误 + ļ¼ˆé‚£ę—¶ęŠ„é”™äæ”ęÆä¼šå¾ˆęœ‰čæ·ęƒ‘ę€§ļ¼Œēœ‹čµ·ę„åƒ prompt é—®é¢˜ļ¼‰ć€‚ + """ + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise RuntimeError( + "TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME " + "must be set before starting the HTTP service." 
+ ) + return LlmAgent( + name="math_word_problem_agent", + description="Math word-problem solver served over HTTP.", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url), + instruction=_read_system_prompt(), + generate_content_config=GenerateContentConfig( + temperature=0.2, + top_p=0.9, + max_output_tokens=2048, + ), + ) + + +app = FastAPI(title="http_service demo") + + +@app.get("/health") +async def health() -> dict[str, str]: + """å„åŗ·ę£€ęŸ„ē«Æē‚¹ļ¼šä¼˜åŒ–å™ØåÆåŠØå‰ ping äø€ę¬”ē”®č®¤ęœåŠ”å·²å°±ē»Ŗć€‚""" + return {"status": "ok"} + + +@app.post("/chat", response_model=ChatResponse) +async def chat(request: ChatRequest) -> ChatResponse: + """å•ę¬”ęŽØē†ć€‚ęÆę¬”éƒ½ę–°å»ŗ Runner + InMemorySession + é‡čÆ» prompt怂 + + ę— ēŠ¶ę€č®¾č®”ļ¼šä¼˜åŒ–å™ØåÆčƒ½å¹¶å‘čÆ„ęµ‹å¤šę” caseļ¼Œå…±äŗ« session ä¼šåÆ¼č‡“äøŠäø‹ę–‡ + ę±”ęŸ“ć€‚ęÆę¬”čÆ·ę±‚é‡å»ŗ LlmAgent ä¹Ÿę„å‘³ē€ęÆę¬”éƒ½é‡čÆ» system.md,正是 + ä¼˜åŒ–å™Øå†™å…„ę–°å€™é€‰åŽčƒ½ē«‹å³ē”Ÿę•ˆēš„å…³é”®ć€‚ + """ + agent = _build_agent() + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "http_client" + await session_service.create_session( + app_name=APP_NAME, user_id=user_id, session_id=session_id, state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=request.query)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, session_id=session_id, new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: # 跳过 thinking token + continue + if part.text: + final_text += part.text + return ChatResponse(final_text=final_text.strip()) + + +if __name__ == "__main__": + uvicorn.run(app, host=HOST, port=PORT, log_level="warning") diff --git a/examples/optimization/http_service/train.evalset.json 
b/examples/optimization/http_service/train.evalset.json new file mode 100644 index 0000000..b63b3b2 --- /dev/null +++ b/examples/optimization/http_service/train.evalset.json @@ -0,0 +1,112 @@ +{ + "eval_set_id": "http_service_train", + "name": "HTTP service demo - train", + "description": "5 é“å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ļ¼›agent é€ščæ‡ HTTP ęœåŠ”å›žē­”ļ¼Œfinal_response äø­éœ€ containsć€Œē­”ę”ˆļ¼šxxxć€ę ¼å¼ć€‚", + "eval_cases": [ + { + "eval_id": "wp_apples_add", + "conversation": [ + { + "invocation_id": "t1", + "user_content": { + "parts": [{"text": "å°ę˜Žę—©äøŠä¹°äŗ† 4 äøŖč‹¹ęžœļ¼Œäø‹åˆåˆä¹°äŗ† 7 äøŖč‹¹ęžœļ¼Œä»–äø€å…±ęœ‰å¤šå°‘äøŖč‹¹ęžœļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š11 äøŖ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_car_distance", + "conversation": [ + { + "invocation_id": "t2", + "user_content": { + "parts": [{"text": "äø€č¾†ę±½č½¦ä»„ęÆå°ę—¶ 60 å…¬é‡Œēš„é€Ÿåŗ¦č”Œé©¶ 2.5 å°ę—¶ļ¼Œäø€å…±č”Œé©¶äŗ†å¤šå°‘å…¬é‡Œļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š150 公里"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_discount_price", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [{"text": "äø€ä»¶č”£ęœåŽŸä»· 200 å…ƒļ¼ŒēŽ°åœØę‰“ 8 ęŠ˜å‡ŗå”®ļ¼ŒęŠ˜åŽä»·ę˜Æå¤šå°‘å…ƒļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š160 元"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_glasses_percent", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [{"text": "ē­é‡Œäø€å…±ęœ‰ 40 åå­¦ē”Ÿļ¼Œå…¶äø­ 25% ęˆ“ēœ¼é•œļ¼Œęˆ“ēœ¼é•œēš„ęœ‰å¤šå°‘äŗŗļ¼Ÿ"}], + "role": "user" 
+ }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š10 äŗŗ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_defect_items_percent", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "一批商品共 50 ä»¶ļ¼Œå…¶äø­ 30% ę˜Æę¬”å“ļ¼Œę¬”å“ęœ‰å¤šå°‘ä»¶ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š15 ä»¶"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "trainer", + "state": {} + } + } + ] +} diff --git a/examples/optimization/http_service/val.evalset.json b/examples/optimization/http_service/val.evalset.json new file mode 100644 index 0000000..abbad27 --- /dev/null +++ b/examples/optimization/http_service/val.evalset.json @@ -0,0 +1,70 @@ +{ + "eval_set_id": "http_service_val", + "name": "HTTP service demo - validation", + "description": "3 é“å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ļ¼›ē”ØäŗŽęÆč½®å…Øé‡čÆ„ä¼°ć€å†³å®šå€™é€‰ę˜Æå¦č¢«ęŽ„å—ć€‚", + "eval_cases": [ + { + "eval_id": "wp_seats_multiply", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [{"text": "ę•™å®¤é‡Œęœ‰ 5 ęŽ’åŗ§ä½ļ¼ŒęÆęŽ’ 8 äøŖļ¼Œäø€å…±å¤šå°‘äøŖåŗ§ä½ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š40 äøŖ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_water_weight", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [{"text": "已矄 1 å‡ę°“é‡ 1 åƒå…‹ļ¼Œ3.5 å‡ę°“é‡å¤šå°‘åƒå…‹ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š3.5 千克"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_class_girls_percent", + 
"conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "ē­é‡Œäø€å…±ęœ‰ 30 äŗŗļ¼Œå…¶äø­ 60% ę˜Æå„³ē”Ÿļ¼ŒčÆ·é—®ęœ‰å¤šå°‘åå„³ē”Ÿļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š18 äŗŗ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "http_service_demo_agent", + "user_id": "validator", + "state": {} + } + } + ] +} diff --git a/examples/optimization/multi_agent_pipeline/README.md b/examples/optimization/multi_agent_pipeline/README.md new file mode 100644 index 0000000..86e1fea --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/README.md @@ -0,0 +1,191 @@ +# Multi-Agent Pipeline — 多 sub-agent prompt č”åˆä¼˜åŒ– + +> **é€‚ē”Øåœŗę™Æ**ļ¼šäøšåŠ”ä¾§å·²ē¼–ęŽ’å„½å¤š sub-agent åä½œé“¾č·Æļ¼ˆrouter / åˆ†ę”Æ worker / summarizer ē­‰ļ¼‰ļ¼ŒåøŒęœ›åœØäøäæ®ę”¹é“¾č·Æä»£ē ēš„å‰ęäø‹ļ¼ŒåÆ¹ęÆäøŖ sub-agent ēš„ prompt čæ›č”Œč”åˆä¼˜åŒ–ć€‚ęœ¬ example 在 `quickstart/` å•å­—ę®µä¼˜åŒ–ēš„åŸŗē”€äøŠļ¼Œę¼”ē¤ŗå¤šå­—ę®µ `TargetPrompt` äøŽ GEPA å¤šęØ”å—ååŒēš„å…³é”®é…ē½®ć€‚é˜…čÆ»å‰čÆ·å…ˆē†Ÿę‚‰ `quickstart/README.md` §2 äø­ēš„åŸŗē”€ęœÆčÆ­ć€‚ + +## 1 Ā· é€‚ē”Øé—®é¢˜äøŽč®¾č®”ē›®ę ‡ + +多 agent é“¾č·Æēš„ prompt å·„ēØ‹ęÆ”å• agent å¤ę‚ļ¼šęÆäøŖ sub-agent ęœ‰ē‹¬ē«‹ promptļ¼Œå­—ę®µé—“å­˜åœØéšå¼å„‘ēŗ¦ļ¼ˆrouter ēš„č¾“å‡ŗę ‡ē­¾åæ…é”»åŒ¹é…äø‹ęøø worker ꜟꜛ态summarizer ēš„ę ¼å¼åæ…é”»å…¼å®¹äøŠęøøäø­é—“ē»“ęžœļ¼‰ć€‚ę‰‹å·„čæ­ä»£ę—¶åøøč§ē—‡ēŠ¶ę˜Æ"改 A č§ę•ˆļ¼Œä½†ę‹–ē“Æ B"怂 + +本 example ēš„č®¾č®”åŽŸåˆ™ļ¼š + +- **链路代码零修改**ļ¼šä¼˜åŒ–å™Øé€ščæ‡ę–‡ä»¶å†™å…„å€™é€‰ prompt,sub-agent åœØęÆę¬”č°ƒē”Øę—¶ēŽ°čÆ»ēŽ°ē”Ø +- **字段闓归因清晰**:`module_selector="round_robin"` č®©ęÆč½®åę€åŖę”¹äø€äøŖå­—ę®µ +- **å¤šå­—ę®µęˆęžœčžåˆ**:`use_merge=true` åœØē“Æē§Æč‹„å¹²å•å­—ę®µę”¹åŠØåŽäø»åŠØå°čÆ•åˆå¹¶ +- **跨字段记忆延展**:`reflection_history_top_k=3` č®©åę€ LM åœØč½®ę¢äø­äæē•™ę›“é•æåŽ†å² + +| 输兄 | 输出 | +| --- | --- | +| å·²ē¼–ęŽ’å„½ēš„å¤š sub-agent é“¾č·Æļ¼ˆęœ¬ example äø­äøŗ `invoke_pipeline()`) | ęÆäøŖ sub-agent ēš„ęœ€ä¼˜ 
prompt å€™é€‰ļ¼ˆ`best_prompts/` äø‹å¤šäøŖ `.md` 文件) | +| åŒäø€äøŖ `TargetPrompt` äøŠę³Øå†Œēš„å¤šäøŖå­—ę®µļ¼ˆęÆå­—ę®µäø€äøŖ `add_path`) | å•äø€ `final_response_avg_score` metric ēš„ē«Æåˆ°ē«Æåˆ†ę•°ęå‡ | + +### 本 example ę¼”ē¤ŗēš„ęœ€å°ē”Øä¾‹ + +| 结度 | 值 | +| --- | --- | +| 业劔任劔 | ę··åˆäŗ‹å®žé¢˜äøŽē®—ęœÆé¢˜åˆ†ęµé—®ē­” | +| é“¾č·Æē»“ęž„ | `router → fact_agent / math_agent → summarizer`ļ¼ˆå…± 4 äøŖ sub-agent) | +| ä¼˜åŒ–ē›®ę ‡ | `pipeline/prompts/{router,fact_agent,math_agent,summarizer}.md` 共 4 个字段 | +| éŖŒčÆęŒ‡ę ‡ | `final_response_avg_score`(contains åŒ¹é…ļ¼Œč¦ę±‚ęœ€ē»ˆē­”å¤åŒ…å« `ē­”ę”ˆļ¼šxxx`) | +| 训练 / éŖŒčÆč§„ęØ” | 5 ę”ę··åˆ case(3 äŗ‹å®ž + 2 数学)/ 3 ę”ę··åˆ case | + +## 2 Ā· ęœÆčÆ­åÆ¹ē…§ + +ä»…åˆ—å‡ŗęœ¬ example å¼•å…„ēš„ę–°ę¦‚åæµć€‚åŸŗē”€ęœÆčÆ­č§ `quickstart/README.md` §2怂 + +| 术语 | 含义 | +| --- | --- | +| **å¤šå­—ę®µ TargetPrompt** | åŒäø€ `TargetPrompt` å®žä¾‹äøŠå¤šę¬”č°ƒē”Ø `add_path()`ļ¼ŒęÆę¬”ę³Øå†Œäø€äøŖē‹¬ē«‹ prompt ꖇ件怂GEPA č§†ęÆäøŖ key äøŗäø€äøŖē‹¬ē«‹ēš„åÆä¼˜åŒ–ęØ”å—ļ¼ˆcomponent / predictor)。 | +| **module_selector** | ęÆč½®åę€é€‰ę‹©å“ŖäøŖå­—ę®µč¢«ę”¹å†™ēš„ē­–ē•„ć€‚`"round_robin"` ęŒ‰ę³Øå†Œé”ŗåŗé€č½®å•é€‰ļ¼›`"all"` ęÆč½®å…Øé€‰ļ¼›`"random"` éšęœŗå•é€‰ć€‚å¤šå­—ę®µä¼˜åŒ–ęŽØč `round_robin`怂 | +| **use_merge** | ę˜Æå¦åÆē”Ø GEPA ēš„ merge ę“ä½œć€‚ęÆéš”č‹„å¹²č½®ļ¼Œē®—ę³•ęŒ‘é€‰äø¤äøŖåœØäøåŒå­—ę®µäøŠå„ęœ‰ę‰€é•æēš„åŽ†å²å€™é€‰ļ¼Œčžåˆęˆäø€äøŖ"å…Øå­—ę®µéƒ½å„½"ēš„ę–°å€™é€‰ć€‚**仅多 predictorļ¼ˆå¤šå­—ę®µļ¼‰ę—¶ęœ‰ę„ä¹‰**ā€”ā€”å•å­—ę®µä¼˜åŒ–äøä¼šč§¦å‘ merge怂 | +| **max_merge_invocations** | ę•“äøŖ run äø­å…č®øč§¦å‘ merge ēš„ę¬”ę•°äøŠé™ć€‚ | +| **reflection_history_top_k** | åę€ prompt äø­ęÆę” case ęŗåø¦å¤šå°‘ę”åŽ†å²ęœ€ä½³å“åŗ”ć€‚å¤šå­—ę®µč½®ę¢ę—¶č°ƒå¤§åÆē¼“č§£"äøŠę¬”ę”¹ęŸå­—ę®µę—¶å­¦åˆ°ēš„ę–¹å‘č¢«é—åæ˜"怂 | +| **Other Active Components** | SDK č‡ŖåŠØę³Øå…„åˆ°åę€ prompt ēš„ę®µč½ć€‚å½“åę€ LM åœØę”¹å­—ę®µ X ę—¶ļ¼ŒčÆ„ę®µč½åˆ—å‡ŗę‰€ęœ‰å…¶ä»–å­—ę®µļ¼ˆY / Z / ...ļ¼‰å½“å‰ēš„å†…å®¹ļ¼Œä½æ LM åœØę”¹ X 
ę—¶čƒ½ę„ŸēŸ„é“¾č·Æå…¶ä»–ēŽÆčŠ‚ēš„ēŽ°ēŠ¶ć€‚ę— éœ€é…ē½®ć€‚ | + +## 3 Ā· čæč”Œē¤ŗä¾‹ + +### 3.1 å®‰č£…ä¾čµ– + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 é…ē½®ēŽÆå¢ƒå˜é‡ + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 启动 + +```bash +python examples/optimization/multi_agent_pipeline/run_optimization.py +``` + +å•ę¬”čæč”Œēŗ¦ 10–20 åˆ†é’Ÿć€‚ęÆę” case č§¦å‘ 3 ꬔ LLM ęŽØē†ļ¼ˆrouter → åˆ†ę”Æ worker → summarizerļ¼‰ļ¼Œę•“ä½“ LLM č°ƒē”Øé‡ēŗ¦äøŗ quickstart ēš„ 3 å€ć€‚ + +### 3.4 äŗ§ē‰©ē»“ęž„ + +``` +runs// +ā”œā”€ā”€ result.json å®Œę•“čæč”Œč®°å½•ļ¼ˆå«ęÆč½® optimized_field_names / kind) +ā”œā”€ā”€ summary.txt äŗŗē±»åÆčÆ»ę‘˜č¦ +ā”œā”€ā”€ baseline_prompts/ čæč”Œå‰ 4 äøŖ prompt ę–‡ä»¶ēš„åæ«ē…§ +ā”œā”€ā”€ best_prompts/ val é›†å¾—åˆ†ęœ€é«˜ēš„å€™é€‰ļ¼ˆ4 äøŖ .md) +└── rounds/round_*.json ęÆč½®åę€ promptć€å€™é€‰ę–‡ęœ¬ć€å­—ę®µč½®ę¢č®°å½• +``` + +## 4 Ā· ęž¶ęž„äøŽę•°ę®ęµ + +``` +[run_optimization.py] + │ + ā”œā”€ā”€ TargetPrompt + │ .add_path("router", pipeline/prompts/router.md) + │ .add_path("fact_agent", pipeline/prompts/fact_agent.md) + │ .add_path("math_agent", pipeline/prompts/math_agent.md) + │ .add_path("summarizer", pipeline/prompts/summarizer.md) + │ + ā”œā”€ā”€ call_agent(query) = await invoke_pipeline(query) + │ ā”œā”€ router (读 router.md) → 输出 "fact" ꈖ "math" + │ ā”œā”€ fact / math (读对应 .md) → äø­é—“ē­”å¤ + │ └─ summarizer (读 summarizer.md) → ęœ€ē»ˆē­”å¤ļ¼ˆå« "ē­”ę”ˆļ¼šxxx") + │ + └── AgentOptimizer.optimize äø»å¾ŖēŽÆ + ā”œā”€ā”€ module_selector="round_robin" ęÆč½®é€‰äø€äøŖå­—ę®µę”¹å†™ + ā”œā”€ā”€ ęŠŠå€™é€‰ prompt 写兄对应文件 → 下一欔 invoke_pipeline č‡ŖåŠØčÆ»åˆ° + ā”œā”€ā”€ use_merge=trueļ¼šęÆéš”č‹„å¹²č½®äø»åŠØčžåˆäøåŒå­—ę®µēš„åŽ†å²ęœ€ä½³ + └── åę€ prompt č‡ŖåŠØåŒ…å« Other Active Components ꮵ +``` + +### 4.1 ę–‡ä»¶ęø…å• + +| ꖇ件 | 角色 | ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ēš„äæ®ę”¹ę–¹å‘ | +| --- | --- | --- | +| `run_optimization.py` | ä¼˜åŒ–å™Øå…„å£ļ¼Œę³Øå†Œ 4 字段 
`TargetPrompt`ļ¼Œå®šä¹‰ `call_agent` | 将 `invoke_pipeline` ę›æę¢äøŗäøšåŠ”č‡Ŗęœ‰é“¾č·Æč°ƒē”Øå…„å£ | +| `pipeline/orchestrator.py` | é“¾č·Æē¼–ęŽ’å®žēŽ°ļ¼ŒęÆäøŖ sub-agent åœØęÆę¬”č°ƒē”Øę—¶é‡čÆ» prompt ꖇ件 | ēœŸå®žäøšåŠ”äø‹ę•“ä½“ę›æę¢äøŗäøšåŠ”é“¾č·Æä»£ē  | +| `pipeline/prompts/router.md` ē­‰ 4 ꖇ件 | 各 sub-agent ēš„ system prompt(GEPA 写兄目标) | ę›æę¢äøŗäøšåŠ” baselineļ¼›ęÆå­—ę®µåÆ¹åŗ” `TargetPrompt` 中一个 key | +| `optimizer.json` | 算法 + metric é…ē½® | é‡ē‚¹å…³ę³Ø `module_selector` / `use_merge` / `reflection_history_top_k` | +| `train.evalset.json` / `val.evalset.json` | ę•°ę®é›† | ę›æę¢äøŗäøšåŠ”ē”Øä¾‹ | + +### 4.2 prompt ēƒ­åŠ č½½ēŗ¦ęŸ + +ęÆäøŖ sub-agent åœØęÆę¬”č¢«č°ƒē”Øę—¶åæ…é”»é‡čÆ»č‡Ŗå·±ēš„ prompt ę–‡ä»¶ļ¼Œå¦åˆ™ä¼˜åŒ–å™Øå†™å…„ēš„ę–°å€™é€‰äøä¼šē”Ÿę•ˆć€‚`pipeline/orchestrator.py` ēš„ `_create_sub_agent()` åœØęÆę¬” `invoke_pipeline()` äø­é‡ę–°ęž„é€  sub-agent å®žä¾‹å¹¶é‡čÆ»åÆ¹åŗ” `.md` ę–‡ä»¶å®žēŽ°čÆ„čÆ­ä¹‰ć€‚ + +## 5 Ā· å…³é”®é…ē½® + +`optimizer.json` 中本 example äøŽ quickstart ēš„ę øåæƒå·®å¼‚ļ¼š + +```jsonc +{ + "optimize": { + "algorithm": { + "module_selector": "round_robin", // å¤šå­—ę®µč½®ę¢ēš„å…³é”® + "use_merge": true, // å¤šå­—ę®µęˆęžœčžåˆ + "max_merge_invocations": 3, // merge ę¬”ę•°äøŠé™ + "reflection_history_top_k": 3, // å¤šå­—ę®µč½®ę¢ę—¶č°ƒå¤§ + "reflection_minibatch_size": 2, + "max_metric_calls": 60 + } + } +} +``` + +### 5.1 `module_selector` 选择对照 + +| 取值 | 蔌为 | 适用 | +| --- | --- | --- | +| `"round_robin"` | ęÆč½®ęŒ‰ę³Øå†Œé”ŗåŗå•é€‰ 1 个字段 | å­—ę®µé—“å­˜åœØä¾čµ–ļ¼›éœ€č¦ęø…ę™°å½’å› ļ¼ˆęŽØčļ¼‰ | +| `"all"` | ęÆč½®ę‰€ęœ‰å­—ę®µäø€čµ·ę”¹ | å­—ę®µē‹¬ē«‹ć€åøŒęœ›åæ«é€Ÿęœē“¢ļ¼›å­˜åœØ"äø€äøŖę”¹åę‹–ē“Æę•“ä½“"风险 | +| `"random"` | ęÆč½®éšęœŗå•é€‰ 1 个字段 | å­—ę®µę— ę˜Žę˜¾ä¾čµ–ć€åøŒęœ›å‡åŒ€ęŽ¢ē“¢ | + +### 5.2 `use_merge` åœØå¤šå­—ę®µåœŗę™Æēš„ä»·å€¼ + +`round_robin` č®©ęÆč½®åŖę”¹ 1 äøŖå­—ę®µļ¼Œå‡ č½®åŽä¼šå‡ŗēŽ°"router 改儽了但 summarizer 还差 / fact_agent 改儽了但 math_agent 还差"ēš„å±€é¢ć€‚`use_merge=true` 让 GEPA éš”å‡ 
č½®äø»åŠØå°čÆ•åˆå¹¶ā€”ā€”ä¾‹å¦‚ęŠŠ"router ä¼˜åŒ–ē‰ˆ"和"summarizer ä¼˜åŒ–ē‰ˆ"čžåˆęˆ"å…Øå­—ę®µéƒ½å„½"ēš„å€™é€‰ć€‚ + +> **é‡č¦ēŗ¦ęŸ**:merge 是 predictor-level ę“ä½œļ¼Œ**仅多 predictorļ¼ˆå¤šå­—ę®µļ¼‰åœŗę™Æē”Ÿę•ˆ**ć€‚å•å­—ę®µä¼˜åŒ–äø‹ `use_merge=true` ę°øčæœäøä¼šč§¦å‘ merge roundļ¼Œé…ē½®ę— å‰Æä½œē”Øä½†ä¹Ÿę— ę”¶ē›Šć€‚ + +`max_merge_invocations` é™åˆ¶åˆå¹¶å°čÆ•ę¬”ę•°ļ¼Œéæå…ę— é™ę‹¼ęŽ„ć€‚ + +### 5.3 `reflection_history_top_k` åœØå¤šå­—ę®µåœŗę™Æēš„ä»·å€¼ + +å¤šå­—ę®µč½®ę¢ę—¶åę€ LM 在第 N 轮改 `summarizer`ļ¼Œä½† `router` ę˜Æå‡ č½®å‰ę‰ę”¹čæ‡ēš„ā€”ā€”LM å®¹ę˜“é—åæ˜"äøŠę¬”ę”¹ router ę—¶å­¦åˆ°ēš„ę–¹å‘"怂`reflection_history_top_k=3`(默认 2ļ¼‰č®©åę€ prompt äø­ęÆę” case ęŗåø¦åŽ†å²ęœ€ä½³å“åŗ” 3 ę”ļ¼Œē›øå½“äŗŽē»™ LM ęä¾›"čæ‡åŽ»å‡ č½®å“Ŗäŗ›ę–¹å‘å„ę•ˆ"ēš„č®°åæ†ć€‚ + +## 6 Ā· č°ƒčÆ•ęŠ€å·§ + +### 6.1 验证 round-robin ēœŸēš„č½®ęµę”¹å­—ę®µ + +č·‘å®ŒåŽę£€ęŸ„ `runs//result.json` 中各 round ēš„ `optimized_field_names`ļ¼Œåŗ”ęŒ‰ `router → fact_agent → math_agent → summarizer → router → ...` é”ŗåŗå¾ŖēŽÆć€‚ + +### 6.2 验证 merge ę˜Æå¦č§¦å‘ + +各 round ēš„ `kind` å­—ę®µļ¼š`"reflective"` ę˜Æę™®é€šåę€č½®ļ¼Œ`"merge"` ę˜Æčžåˆč½®ć€‚ + +### 6.3 ęŸ„ēœ‹åę€ prompt ēš„ Other Active Components + +在 `result.json` ēš„ round detail äø­ļ¼Œåę€ prompt ę–‡ęœ¬åÆč§ Other Active Components ę®µč½ļ¼Œåˆ—å‡ŗå½“å‰č½®ę¬”ä»„å¤–ēš„ę‰€ęœ‰å­—ę®µå½“å‰å†…å®¹ć€‚ + +## 7 Ā· åøøč§é—®é¢˜ + +**Qļ¼šé“¾č·Æåæ…é”»ē”±ęœ¬ę”†ęž¶ēš„ LlmAgent ē¼–ęŽ’å—ļ¼Ÿ** +Aļ¼šäøåæ…ć€‚`call_agent` åŖč¦ę±‚ `async (query: str) -> str` ē­¾åć€‚åÆä»„č®©å®ƒęŠŠ query é€ä¼ ē»™ HTTP 请求 / gRPC č°ƒē”Ø / å†…éƒØ SDK / å…¶ä»–ē¼–ęŽ’ę”†ęž¶ć€‚ęœ¬ example 用 `invoke_pipeline` ä»…ä½œę¼”ē¤ŗļ¼ŒäøšåŠ”åÆä»„ę›æę¢äøŗä»»ä½•å½¢ę€ć€‚ + +**Qļ¼šęÆäøŖ sub-agent åæ…é”»åœØåŒäø€čæ›ēØ‹å—ļ¼Ÿ** +Aļ¼šäøåæ…ć€‚ęÆäøŖ sub-agent åÆä»„ę˜Æē‹¬ē«‹ęœåŠ”ļ¼Œprompt é€ščæ‡é…ē½®äø­åæƒč€Œéžęœ¬åœ°ę–‡ä»¶äø‹å‘ā€”ā€”ęŠŠ `add_path` ę›æę¢äøŗ `add_callback`ļ¼Œå‚č§ `remote_prompt_store/` example怂 + +**Qļ¼šå• case ē»čæ‡å¤šę¬” LLM 
ęŽØē†ļ¼ŒčÆ„ęµ‹å¼€é”€å¾ˆå¤§å¦‚ä½•ęŽ§åˆ¶ļ¼Ÿ** +Aļ¼šč°ƒå° `eval_case_parallelism` 防止 LLM rate limitļ¼›č°ƒå° `reflection_minibatch_size` å‡å°‘ęÆč½® case ę•°ļ¼›č°ƒå° `max_metric_calls` é™åˆ¶ę€»é¢„ē®—ć€‚ + +**Q:`use_merge=true` 但 `merge_rounds=0`?** +Aļ¼šå•å­—ę®µä¼˜åŒ–äø‹ merge äøä¼šč§¦å‘ļ¼›å¤šå­—ę®µåœŗę™Æäø‹ä¹Ÿéœ€ē“Æē§Æč‹„å¹²č½®åę€åŽę‰ä¼šęœ‰å€™é€‰ę»”č¶³ merge ę”ä»¶ć€‚č€åæƒčæč”Œč‡³å°‘ `max_merge_invocations` č½®ä»„äøŠč§‚åÆŸć€‚ + +## 8 Ā· ęŽ„å…„č‡Ŗęœ‰é“¾č·Æēš„ę­„éŖ¤ + +1. **ę›æę¢ `pipeline/orchestrator.py`**ļ¼šå®žēŽ°č‡Ŗå·±ēš„ `invoke_pipeline(query) -> str`ļ¼ŒåÆä»„ę˜Æ HTTP / gRPC / å†…éƒØē¼–ęŽ’ +2. **修改 prompt 文件路径**ļ¼šęŠŠęÆäøŖ sub-agent ēš„ prompt ę–‡ä»¶č·Æå¾„ä½œäøŗ `TargetPrompt.add_path` ēš„å‚ę•°ę³Øå†Œ +3. **äæčÆ prompt 热加载**ļ¼šęÆäøŖ sub-agent åœØęÆę¬”č¢«č°ƒē”Øę—¶é‡čÆ» promptļ¼ˆęˆ–é‡ę–°ę‹‰é…ē½®äø­åæƒļ¼‰ +4. **ę›æę¢ę•°ę®é›†äøŽ metric**:`train.evalset.json` / `val.evalset.json` / `optimizer.json` +5. **运蔌**ļ¼šę ¹ę® `result.json` äø­ēš„ `optimized_field_names` / `kind` åŗåˆ—åˆ†ęžå­—ę®µč½®ę¢äøŽ merge 蔌为 + +č‹„ prompt äøåœØęœ¬åœ°č€ŒåœØé…ē½®äø­åæƒļ¼ŒęŠŠ `add_path` ę›æę¢äøŗ `add_callback`ļ¼Œå…¶ä½™ē»“ęž„äæęŒäøå˜ć€‚ diff --git a/examples/optimization/multi_agent_pipeline/optimizer.json b/examples/optimization/multi_agent_pipeline/optimizer.json new file mode 100644 index 0000000..f1f5267 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/optimizer.json @@ -0,0 +1,46 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "contains", + "case_insensitive": true + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 1, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { 
"max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "instance", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 3, + "skip_perfect_score": false, + "use_merge": true, + "max_merge_invocations": 3, + "max_metric_calls": 60, + "score_threshold": 1.0, + "max_iterations_without_improvement": 6 + } + } +} diff --git a/examples/optimization/multi_agent_pipeline/pipeline/__init__.py b/examples/optimization/multi_agent_pipeline/pipeline/__init__.py new file mode 100644 index 0000000..bc6e483 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/optimization/multi_agent_pipeline/pipeline/config.py b/examples/optimization/multi_agent_pipeline/pipeline/config.py new file mode 100644 index 0000000..d0a64b1 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/config.py @@ -0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""ęØ”åž‹å‡­ę®čÆ»å– —— ä»ŽēŽÆå¢ƒå˜é‡åŠ č½½ OpenAI 兼容 LLM ēš„čæžęŽ„äæ”ęÆć€‚ + +éœ€č¦ēš„ēŽÆå¢ƒå˜é‡ +-------------- + TRPC_AGENT_API_KEY LLM åŽē«Æēš„ API key + TRPC_AGENT_BASE_URL LLM åŽē«Æēš„ endpoint + TRPC_AGENT_MODEL_NAME ęØ”åž‹å + +ē¼ŗä»»ę„äø€äøŖå°±ē«‹å³ęŠ› ValueErrorļ¼Œéæå…čæč”Œåˆ°äø€åŠę‰ę’žåˆ° LLM åŽē«Æēš„ 401 é”™čÆÆļ¼Œ +é‚£ę—¶ęŠ„é”™äæ”ęÆä¼šå¾ˆęœ‰čæ·ęƒ‘ę€§ļ¼ˆēœ‹čµ·ę„åƒ prompt å†™é”™äŗ†ļ¼Œå®žé™…ę˜Æå‡­ę®ę²”é…ļ¼‰ć€‚ +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """čæ”å›ž (api_key, base_url, model_name)ļ¼›ä»»äø€ē¼ŗå¤±ē«‹åˆ»ęŠ„é”™ć€‚""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "čæč”Œä¼˜åŒ–å™Øå‰åæ…é”»é…ē½®ēŽÆå¢ƒå˜é‡ TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME怂" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/multi_agent_pipeline/pipeline/orchestrator.py b/examples/optimization/multi_agent_pipeline/pipeline/orchestrator.py new file mode 100644 index 0000000..a585bb7 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/orchestrator.py @@ -0,0 +1,131 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""ęØ”ę‹Ÿ"äøšåŠ”ę–¹å·²ē¼–ęŽ’å„½ēš„å¤š agent 链路"怂 + +链路形态:: + + ē”Øęˆ·é—®é¢˜ → router → (fact_agent ꈖ math_agent) → summarizer → ęœ€ē»ˆē­”å¤ + +prompt ēƒ­åŠ č½½ēŗ¦ęŸ +----------------- +ęÆäøŖ sub-agent åœØęÆę¬”č¢«č°ƒē”Øę—¶åæ…é”»é‡čÆ»č‡Ŗå·±ēš„ prompt ę–‡ä»¶ā€”ā€”ä¼˜åŒ–å™Øé€ščæ‡ +TargetPrompt.add_path ęŠŠå€™é€‰ prompt å†™å…„åÆ¹åŗ”ę–‡ä»¶åŽļ¼Œäø‹äø€ę¬” invoke_pipeline +č°ƒē”Øå„ sub-agent č‡ŖåŠØē”Øęœ€ę–° promptļ¼Œę— éœ€é‡åÆć€‚ + +ęŽ„å…„č‡Ŗęœ‰é“¾č·Æę—¶ę”¹å“Ŗé‡Œ +-------------------- +ēœŸå®žäøšåŠ”äø‹ę•“ä½“ę›æę¢ęœ¬ę–‡ä»¶äøŗäøšåŠ”é“¾č·Æä»£ē ļ¼š +- ęÆäøŖ sub-agent åÆä»„ę˜ÆäøåŒčæ›ēØ‹ / ęœåŠ” / ꔆꞶ +- prompt é€šåøøé€ščæ‡é…ē½®äø­åæƒļ¼ˆäøę˜Æęœ¬åœ°ę–‡ä»¶ļ¼‰äø‹å‘ļ¼›ęœ¬ę–‡ä»¶ Path.read_text ę¢ęˆ + é…ē½®äø­åæƒ SDK č°ƒē”Øå³åÆļ¼Œé“¾č·ÆéŖØęž¶äøå˜ +- äø»å…„å£ invoke_pipeline(query) -> str ēš„ē­¾åäæęŒäøå˜ļ¼Œč¢« call_agent č°ƒē”Ø +""" + +from __future__ import annotations + +import uuid +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import Part + +from .config import get_model_config + + +_PROMPTS_DIR = Path(__file__).parent / "prompts" +ROUTER_PROMPT_PATH = _PROMPTS_DIR / "router.md" +FACT_AGENT_PROMPT_PATH = _PROMPTS_DIR / "fact_agent.md" +MATH_AGENT_PROMPT_PATH = _PROMPTS_DIR / "math_agent.md" +SUMMARIZER_PROMPT_PATH = _PROMPTS_DIR / "summarizer.md" + +APP_NAME = "multi_agent_pipeline_demo" + + +def _create_sub_agent(name: str, prompt_path: Path) -> LlmAgent: + """ęž„é€ äø€äøŖ sub-agent,instruction ä»ŽåÆ¹åŗ”ę–‡ä»¶ēŽ°čÆ»ēŽ°ē”Øć€‚ + + ęÆę¬”č°ƒē”Øéƒ½é‡čÆ»ē£ē›˜ā€”ā€”čæ™ę˜Æä¼˜åŒ–å™Øå†™å…„ę–°å€™é€‰åŽčƒ½ē«‹å³ē”Ÿę•ˆēš„å…³é”®ć€‚ + """ + api_key, base_url, model_name = get_model_config() + return LlmAgent( + name=name, + description=f"Pipeline sub-agent {name}", + 
model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url), + instruction=prompt_path.read_text(encoding="utf-8").strip(), + generate_content_config=GenerateContentConfig( + temperature=0.2, + top_p=0.9, + max_output_tokens=1024, + ), + ) + + +async def _run_one(agent: LlmAgent, user_text: str) -> str: + """跑一个 sub-agent ę‹æęœ€ē»ˆå›žē­”ć€‚ęÆę¬”ę–°å»ŗ Runner / Session ē»™ęœ¬ case 独立 state怂""" + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "pipeline" + await session_service.create_session( + app_name=APP_NAME, user_id=user_id, session_id=session_id, state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=user_text)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, session_id=session_id, new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return final_text.strip() + + +async def invoke_pipeline(query: str) -> str: + """把 query č·‘čæ‡ę•“ę”é“¾č·Æļ¼Œčæ”å›žęœ€ē»ˆē­”å¤ę–‡ęœ¬ć€‚ + + ęµēØ‹ļ¼š + 1. router å†³å®ščµ° fact 还是 math åˆ†ę”Æ + 2. åÆ¹åŗ”åˆ†ę”Æ sub-agent ē»™å‡ŗäø­é—“ē­”å¤ + 3. summarizer ęŠŠäø­é—“ē­”å¤ę•“ē†ęˆęœ€ē»ˆē­”å¤ + + ęÆäøŖ sub-agent éƒ½é‡ę–°ęž„å»ŗļ¼ˆåœØ _create_sub_agent å†…é‡čÆ» prompt ę–‡ä»¶ļ¼‰ļ¼Œ + äæčÆä¼˜åŒ–å™Øå†™å…„å€™é€‰åŽäø‹äø€ę¬”č°ƒē”Øå³ē”Ÿę•ˆć€‚ + """ + # 1. routerļ¼šę ¹ę®é—®é¢˜ē±»åž‹č¾“å‡ŗ fact / math åˆ†ē±»ę ‡ē­¾ + router = _create_sub_agent("router", ROUTER_PROMPT_PATH) + router_out = await _run_one( + router, + f"ē”Øęˆ·é—®é¢˜ļ¼š{query}\n\nčÆ·åŖč¾“å‡ŗ fact ꈖ math čæ™äø¤äøŖčÆäø­ēš„äø€äøŖć€‚", + ) + branch = "math" if "math" in router_out.lower() else "fact" + + # 2. 
åˆ†ę”Æ sub-agentļ¼šę ¹ę® router 决策选 fact_agent ꈖ math_agent + if branch == "math": + branch_agent = _create_sub_agent("math_agent", MATH_AGENT_PROMPT_PATH) + else: + branch_agent = _create_sub_agent("fact_agent", FACT_AGENT_PROMPT_PATH) + intermediate = await _run_one(branch_agent, query) + + # 3. summarizerļ¼šęŠŠäø­é—“ē»“ęžœę•“ē†äøŗęœ€ē»ˆē­”å¤ + summarizer = _create_sub_agent("summarizer", SUMMARIZER_PROMPT_PATH) + final_text = await _run_one( + summarizer, + f"ē”Øęˆ·é—®é¢˜ļ¼š{query}\n\n上游 agent ē»™å‡ŗēš„äø­é—“ē»“ęžœļ¼š{intermediate}\n\n" + "čÆ·ę•“ē†åŽå‘ˆēŽ°ęœ€ē»ˆē­”å¤ć€‚", + ) + return final_text diff --git a/examples/optimization/multi_agent_pipeline/pipeline/prompts/fact_agent.md b/examples/optimization/multi_agent_pipeline/pipeline/prompts/fact_agent.md new file mode 100644 index 0000000..6a621f3 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/prompts/fact_agent.md @@ -0,0 +1 @@ +ä½ ę˜Æäø€äøŖē™¾ē§‘åŠ©ę‰‹ļ¼Œå›žē­”ē”Øęˆ·ēš„äŗ‹å®žę€§é—®é¢˜ļ¼Œē»™å‡ŗå‡†ē”®ē®€ę“ēš„ē­”ę”ˆć€‚ diff --git a/examples/optimization/multi_agent_pipeline/pipeline/prompts/math_agent.md b/examples/optimization/multi_agent_pipeline/pipeline/prompts/math_agent.md new file mode 100644 index 0000000..6941506 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/prompts/math_agent.md @@ -0,0 +1 @@ +ä½ ę˜Æäø€äøŖę•°å­¦åŠ©ę‰‹ļ¼ŒčÆ·č§£ē­”ē”Øęˆ·ēš„ē®—ęœÆé¢˜ć€‚ diff --git a/examples/optimization/multi_agent_pipeline/pipeline/prompts/router.md b/examples/optimization/multi_agent_pipeline/pipeline/prompts/router.md new file mode 100644 index 0000000..ead8bfc --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/pipeline/prompts/router.md @@ -0,0 +1 @@ +ä½ ę˜Æäø€äøŖåˆ†å‘åŠ©ę‰‹ļ¼ŒčÆ·ēœ‹ē”Øęˆ·é—®é¢˜ē„¶åŽå›žē­” fact ꈖ math怂 diff --git a/examples/optimization/multi_agent_pipeline/pipeline/prompts/summarizer.md b/examples/optimization/multi_agent_pipeline/pipeline/prompts/summarizer.md new file mode 100644 index 0000000..21f28d6 --- /dev/null +++ 
b/examples/optimization/multi_agent_pipeline/pipeline/prompts/summarizer.md @@ -0,0 +1 @@ +ä½ ę˜Æäø€äøŖå›žē­”ę•“ē†åŠ©ę‰‹ć€‚čÆ·ęŠŠäøŠęøø agent ē»™å‡ŗēš„äø­é—“ē»“ęžœę•“ē†åŽå‘ˆēŽ°ē»™ē”Øęˆ·ļ¼Œč®©å›žē­”ę›“å‹å„½ę˜“čÆ»ć€‚ diff --git a/examples/optimization/multi_agent_pipeline/run_optimization.py b/examples/optimization/multi_agent_pipeline/run_optimization.py new file mode 100644 index 0000000..7b43518 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/run_optimization.py @@ -0,0 +1,105 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Multi-Agent Pipeline example ēš„ä¼˜åŒ–å™Øå…„å£ć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +äøšåŠ”ä¾§å·²ē¼–ęŽ’å„½å¤š sub-agent åä½œé“¾č·Æļ¼ˆrouter / åˆ†ę”Æ worker / summarizer ē­‰ļ¼‰ļ¼Œ +åøŒęœ›åœØäøäæ®ę”¹é“¾č·Æä»£ē ēš„å‰ęäø‹ļ¼ŒåÆ¹ęÆäøŖ sub-agent ēš„ prompt čæ›č”Œč”åˆä¼˜åŒ–ć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +1. ę³Øå†Œ 4 äøŖ prompt ę–‡ä»¶ä½œäøŗ TargetPrompt ēš„ 4 个独立字段 +2. 定义 call_agent 把 query é€ä¼ ē»™ę•“ę” pipeline 链路 +3. 
调 AgentOptimizer.optimize č·‘ GEPA å¤šęØ”å—ååŒä¼˜åŒ– + +ę€Žä¹ˆč·‘ +------ +1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +2) python examples/optimization/multi_agent_pipeline/run_optimization.py +3) ēœ‹ runs/<ę—¶é—“ęˆ³>/best_prompts/ äø‹ 4 äøŖ .md ꖇ件 + +å…³é”®é…ē½®ļ¼ˆčÆ¦č§ README §5) +-------------------------- +- module_selector="round_robin" : ęÆč½®åę€åŖę”¹ 1 äøŖå­—ę®µļ¼Œä¾æäŗŽå½’å›  +- use_merge=true : ē“Æē§Æå•å­—ę®µę”¹čæ›åŽäø»åŠØčžåˆļ¼ˆå¤šå­—ę®µę‰ęœ‰ę„ä¹‰ļ¼‰ +- reflection_history_top_k=3 : å¤šå­—ę®µč½®ę¢ę—¶ē»™åę€ LM ę›“é•æåŽ†å² + +ęŽ„å…„č‡Ŗęœ‰é“¾č·Æę—¶ę”¹å“Ŗé‡Œ +-------------------- +- pipeline/orchestrator.py äø­ēš„ invoke_pipeline ę›æę¢äøŗäøšåŠ”ēœŸå®žé“¾č·Æč°ƒē”Ø + (HTTP / gRPC / å†…éƒØē¼–ęŽ’ę”†ęž¶ē­‰ä»»ę„å½¢ę€ļ¼‰ +- TargetPrompt.add_path č°ƒę•“äøŗäøšåŠ”å„ sub-agent å®žé™…čÆ»å–ēš„ prompt 文件路径 +- č‹„ prompt åœØé…ē½®äø­åæƒč€Œéžęœ¬åœ°ļ¼ŒęŠŠ add_path ę›æę¢äøŗ add_callback + ļ¼ˆå‚č€ƒ remote_prompt_store/ example) +""" + +from __future__ import annotations + +import asyncio +import sys +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt + +from pipeline.orchestrator import ( + FACT_AGENT_PROMPT_PATH, + MATH_AGENT_PROMPT_PATH, + ROUTER_PROMPT_PATH, + SUMMARIZER_PROMPT_PATH, + invoke_pipeline, +) + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "train.evalset.json" +VAL_PATH = _HERE / "val.evalset.json" +RUNS_DIR = _HERE / "runs" + + +async def call_agent(query: str) -> str: + """ę”†ęž¶å›žč°ƒļ¼šęŠŠ query é€ä¼ ē»™ę•“ę” pipeline é“¾č·Æļ¼Œčæ”å›žęœ€ē»ˆē­”å¤ć€‚""" + return await invoke_pipeline(query) + + +async def main() -> None: + """组装 4 字段 TargetPrompt + 调 AgentOptimizer.optimize怂""" + # 4 äøŖ add_path 
ę³Øå†Œå¤šå­—ę®µä¼˜åŒ–ē›®ę ‡ć€‚GEPA ęŠŠęÆäøŖ key 视为独立 component, + # module_selector="round_robin" č®©ęÆč½®åŖę”¹å…¶äø­ 1 äøŖļ¼Œä¾æäŗŽå½’å› ć€‚ + target = ( + TargetPrompt() + .add_path("router", str(ROUTER_PROMPT_PATH)) + .add_path("fact_agent", str(FACT_AGENT_PROMPT_PATH)) + .add_path("math_agent", str(MATH_AGENT_PROMPT_PATH)) + .add_path("summarizer", str(SUMMARIZER_PROMPT_PATH)) + ) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / timestamp + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + update_source=False, + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/multi_agent_pipeline/train.evalset.json b/examples/optimization/multi_agent_pipeline/train.evalset.json new file mode 100644 index 0000000..ee20342 --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/train.evalset.json @@ -0,0 +1,92 @@ +{ + "eval_set_id": "multi_agent_pipeline_train", + "name": "Multi-agent pipeline demo - train", + "description": "5 ę”ę··åˆ case:3 é“äŗ‹å®žé¢˜ + 2 é“ē®—ęœÆé¢˜ļ¼›č€ƒę øę•“ę” pipeline ēš„ē«Æåˆ°ē«Æę•ˆęžœļ¼ˆč·Æē”± + åˆ†ę”Æåŗ”ē­” + ę€»ē»“ę ¼å¼ļ¼‰ć€‚", + "eval_cases": [ + { + "eval_id": "fact_capital_china", + "conversation": [ + { + "invocation_id": "t1", + "user_content": { + "parts": [{"text": "äø­å›½ēš„é¦–éƒ½ę˜Æå“Ŗé‡Œļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼šåŒ—äŗ¬"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "fact_water_boil", + "conversation": [ + { + "invocation_id": "t2", + "user_content": { + "parts": [{"text": "åœØę ‡å‡†å¤§ę°”åŽ‹äø‹ļ¼Œę°“ēš„ę²øē‚¹ę˜Æå¤šå°‘ę‘„ę°åŗ¦ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": 
"ē­”ę”ˆļ¼š100 ę‘„ę°åŗ¦"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "fact_solar_planets", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [{"text": "å¤Ŗé˜³ē³»ęœ‰å‡ å¤§č”Œę˜Ÿļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š8 颗"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "math_simple_add", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [{"text": "3 加 5 ē­‰äŗŽå¤šå°‘ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š8"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + }, + { + "eval_id": "math_multiply", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "12 乘仄 7 ē­‰äŗŽå¤šå°‘ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š84"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "trainer", "state": {}} + } + ] +} diff --git a/examples/optimization/multi_agent_pipeline/val.evalset.json b/examples/optimization/multi_agent_pipeline/val.evalset.json new file mode 100644 index 0000000..009ceab --- /dev/null +++ b/examples/optimization/multi_agent_pipeline/val.evalset.json @@ -0,0 +1,58 @@ +{ + "eval_set_id": "multi_agent_pipeline_val", + "name": "Multi-agent pipeline demo - validation", + "description": "3 ę”ę··åˆ case:2 é“äŗ‹å®žé¢˜ + 1 é“ē®—ęœÆé¢˜ć€‚", + "eval_cases": [ + { + "eval_id": "fact_great_wall", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [{"text": "äø­å›½ēš„é•æåŸŽęœ€ę—©ę˜Æå“ŖäøŖęœä»£äæ®å»ŗēš„ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": 
"ē­”ę”ˆļ¼šę˜„ē§‹"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "validator", "state": {}} + }, + { + "eval_id": "fact_largest_ocean", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [{"text": "äø–ē•ŒäøŠęœ€å¤§ēš„ę“‹ę˜Æå“ŖäøŖļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼šå¤Ŗå¹³ę“‹"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "validator", "state": {}} + }, + { + "eval_id": "math_subtract", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "100 减去 37 ē­‰äŗŽå¤šå°‘ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š63"}], + "role": "model" + } + } + ], + "session_input": {"app_name": "multi_agent_pipeline_demo", "user_id": "validator", "state": {}} + } + ] +} diff --git a/examples/optimization/multi_metric_with_judges/README.md b/examples/optimization/multi_metric_with_judges/README.md new file mode 100644 index 0000000..f029e7c --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/README.md @@ -0,0 +1,241 @@ +# Multi-Metric with Judges — 多 metric äøŽ multi-judge 集ꈐ + +> **é€‚ē”Øåœŗę™Æ**:业劔 agent åŒę—¶å—å¤šē±»ēŗ¦ęŸļ¼ˆē­”ę”ˆę­£ē”®ę€§ē”¬ēŗ¦ęŸ + é£Žę ¼ / 安全 / åˆč§„č½Æēŗ¦ęŸļ¼‰ļ¼Œéœ€č¦å¤šę” metric å…±åŒå‚äøŽä¼˜åŒ–äøŽę—©åœåˆ¤å®šļ¼Œå¹¶åøŒęœ›é€ščæ‡å¤š judge ęŠ•ē„Øé™ä½Žå• LLM č£åˆ¤ēš„åå·®ć€‚ęœ¬ example 演示 `llm_final_response`(多 judge ęŠ•ē„Øļ¼‰+ `llm_rubric_response`ļ¼ˆå• judge 多 rubricļ¼‰åŒ metric å…±å­˜ć€`frontier_type="hybrid"` 双层 Pareto å‰ę²æć€`stop.required_metrics` ę˜¾å¼åˆ—č”Øēš„å®Œę•“é…ē½®ć€‚é˜…čÆ»å‰čÆ·å…ˆē†Ÿę‚‰ `quickstart/README.md` §2怂 + +## 1 Ā· é€‚ē”Øé—®é¢˜äøŽč®¾č®”ē›®ę ‡ + +单 metric ä¼˜åŒ–åœØå·„ēØ‹å®žč·µäø­å¾€å¾€äøå¤Ÿļ¼š + +- "ē­”ę”ˆåÆ¹å°±č”Œ"åæ½č§†ę ¼å¼ / é£Žę ¼ / åˆč§„ēŗ¦ęŸļ¼Œå®¹ę˜“ę‹æåˆ°"åÆ¹ä½†äøčƒ½ē”Ø"ēš„å›žē­” +- 单 LLM č£åˆ¤å­˜åœØåå·®ļ¼ˆtemperature 
ęŠ–åŠØć€prompt ęš—ē¤ŗć€ęØ”åž‹åå„½ļ¼‰ļ¼Œå°¤å…¶åœØäø»č§‚ē»“åŗ¦äøŠ +- äøåŒ metric åę˜ äøåŒäøšåŠ”čÆ‰ę±‚ļ¼Œåŗ”čƒ½åœØå‰ę²æäøŠååŒå­˜åœØč€Œéžäŗ’ē›øč¦†ē›– + +本 example ēš„č®¾č®”åŽŸåˆ™ļ¼š + +- **ē”¬ēŗ¦ęŸ / č½Æēŗ¦ęŸåˆ†ē¦»**:`llm_final_response` 用 multi-judge `all_pass` ęŠ•ē„ØęŠŠå…³ē­”ę”ˆę­£ē”®ę€§ļ¼›`llm_rubric_response` ē”Øå• judge 多 rubric čÆ„ä¼°ę ¼å¼ / é£Žę ¼ +- **多 judge ęŠ•ē„Øé™ä½Žåå·®**:3 äøŖ judge åœØäøåŒ temperature äø‹ē‹¬ē«‹åˆ¤ę–­ļ¼Œå…Øä½“é€ščæ‡ę‰ē®— PASS +- **双层 Pareto å‰ę²æ**:`frontier_type="hybrid"` åŒę—¶ē»“ęŠ¤ per-case äøŽ per-metric å‰ę²æļ¼Œéæå…"为了改 metric A 牺牲 metric B"ēš„é€€åŒ– +- **ēØ³å®ščÆ„ä¼°**:`num_runs=2` 平滑 LLM 输出方差;`eval_case_parallelism=1` ęŽ§åˆ¶ multi-judge å¹¶å‘éæå… rate limit + +| 输兄 | 输出 | +| --- | --- | +| å¤šę” metricļ¼ˆęÆę”ē‹¬ē«‹ threshold + ē‹¬ē«‹åˆ¤åˆ†é€»č¾‘ļ¼‰ | åŒę—¶ę»”č¶³ę‰€ęœ‰ęŒ‡å®š metric é˜ˆå€¼ēš„ęœ€ä¼˜å€™é€‰ | +| `stop.required_metrics` äø­åˆ—å‡ŗēš„"必锻达标"ēš„ metric 子集 | äø„ę ¼ēš„ę—©åœåˆ¤å®šļ¼šåˆ—č”Øäø­ę‰€ęœ‰ metric 在 val é›†äøŠč¾¾ę ‡ę‰ęå‰ē»ˆę­¢ | + +### 本 example ę¼”ē¤ŗēš„ęœ€å°ē”Øä¾‹ + +| 结度 | 值 | +| --- | --- | +| 业劔任劔 | 数学辅导 agentļ¼Œč¦ę±‚ē­”ę”ˆę­£ē”® + é£Žę ¼č§„čŒƒļ¼ˆę—  emojić€ęŽØē†ęø…ę™°ć€ē­”ę”ˆåø¦å•ä½ļ¼‰ | +| ä¼˜åŒ–ē›®ę ‡ | `agent/prompts/system.md` å•ę–‡ä»¶ | +| éŖŒčÆęŒ‡ę ‡ | `llm_final_response`(3 judge `all_pass`,threshold 1.0) + `llm_rubric_response`(4 rubric å‡å€¼ļ¼Œthreshold 0.75) | +| 训练 / éŖŒčÆč§„ęØ” | 5 ę” / 3 ę” | + +## 2 Ā· ęœÆčÆ­åÆ¹ē…§ + +ä»…åˆ—å‡ŗęœ¬ example å¼•å…„ēš„ę–°ę¦‚åæµć€‚åŸŗē”€ęœÆčÆ­č§ `quickstart/README.md` §2怂 + +| 术语 | 含义 | +| --- | --- | +| **`llm_final_response` metric** | ē”± LLM č£åˆ¤åˆ¤ę–­ agent č¾“å‡ŗę˜Æå¦äøŽå‚č€ƒē­”ę”ˆå®žč“Øäø€č‡“ļ¼Œč¾“å‡ŗ PASS / FAILć€‚åÆé…ē½®å¤šäøŖ judge å…±åŒę‰“åˆ†ć€‚ | +| **`llm_rubric_response` metric** | 单 LLM č£åˆ¤ęŒ‰å¤šę” rubricļ¼ˆčÆ„åˆ†ę ‡å‡†ļ¼‰ē‹¬ē«‹ę‰“åˆ†åŽå–å‡å€¼ļ¼›é€‚åˆå¤šē»“åŗ¦č½Æēŗ¦ęŸć€‚ | +| **multi-judge** | `criterion.llm_judge.judge_models` ę•°ē»„å½¢å¼é…ē½®å¤šäøŖē‹¬ē«‹ judgeļ¼ŒęÆäøŖ judge 
ē‹¬ē«‹č°ƒē”Ø LLM ē»™å‡ŗåˆ¤ę–­ć€‚ | +| **`models_aggregator`** | 多 judge ē»“ęžœēš„čšåˆē­–ē•„ļ¼Œ6 ē§å–å€¼ļ¼ˆč§ §5.2ļ¼‰ć€‚ęœ¬ example 用 `all_pass`怂 | +| **frontier_type** | Pareto å‰ę²æēš„ē²’åŗ¦ć€‚4 ē§å–å€¼ļ¼š`instance`ļ¼ˆęŒ‰ case) / `objective`ļ¼ˆęŒ‰ metric) / `hybrid`ļ¼ˆåŒå±‚ļ¼‰ / `cartesian`ļ¼ˆęŒ‰ caseƗmetricļ¼‰ć€‚å¤š metric ęŽØč `hybrid`怂 | +| **stop.required_metrics** | ę”†ęž¶å±‚ę—©åœēš„ metric å­é›†å£°ę˜Žć€‚`"all"` / åˆ—č”Ø / `null` äø‰ē§å½¢å¼ć€‚ | + +## 3 Ā· čæč”Œē¤ŗä¾‹ + +### 3.1 å®‰č£…ä¾čµ– + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 é…ē½®ēŽÆå¢ƒå˜é‡ + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +agent态reflection LM态ꉀ꜉ judge é»˜č®¤å…±ē”ØåŒäø€ē»„å‡­ę®ć€‚éœ€č¦č®© judge ē”Øē‹¬ē«‹ęØ”åž‹ę—¶å•ē‹¬é…ē½® `judge_model` 字段。 + +### 3.3 启动 + +```bash +python examples/optimization/multi_metric_with_judges/run_optimization.py +``` + +å•ę¬”čæč”Œēŗ¦ 5–10 åˆ†é’Ÿć€‚ęÆę” case äø€ę¬”čÆ„ęµ‹ēŗ¦č§¦å‘ (3 + 1) Ɨ 2 = 8 ꬔ LLM č°ƒē”Øļ¼ˆ3 äøŖ judge Ɨ `num_runs=2` 加 1 äøŖ rubric judge Ɨ `num_runs=2`)。 + +### 3.4 äŗ§ē‰©ē»“ęž„ + +äøŽ quickstart 一臓。`result.json` äø­ `metric_breakdown` å­—ę®µä¼šåŒę—¶åŒ…å« `llm_final_response` äøŽ `llm_rubric_response` äø¤ę”ē‹¬ē«‹åˆ†ę•°ć€‚ + +## 4 Ā· ęž¶ęž„äøŽę•°ę®ęµ + +``` +ęÆäøŖ case äø€ę¬”čÆ„ęµ‹ļ¼š +ā”œā”€ā”€ agent 输出 final_text +│ +ā”œā”€ā”€ llm_final_response (ē”¬ēŗ¦ęŸ) +│ ā”œā”€ judge_1 (temperature=0.0) → valid / invalid +│ ā”œā”€ judge_2 (temperature=0.3) → valid / invalid +│ ā”œā”€ judge_3 (temperature=0.6) → valid / invalid +│ └─ aggregator: all_pass → 三个全 valid ę‰ē®— PASS(threshold=1.0) +│ +└── llm_rubric_response (č½Æēŗ¦ęŸļ¼Œå• judge 多 rubric) + ā”œā”€ rubric: no_emoji_or_slang → 0/1 + ā”œā”€ rubric: numeric_correct → 0/1 + ā”œā”€ rubric: reasoning_clear → 0/1 + └─ rubric: units_present → 0/1 + 平均分 = quality score(threshold=0.75 ā‰ˆ 4 ę”č‡³å°‘ 3 ę”čæ‡ļ¼‰ + +stop.required_metrics = ["llm_final_response", "llm_rubric_response"] + 两个 
metric 都在 val é›†äøŠč¾¾ threshold ę‰ęå‰åœę­¢ +``` + +### 4.1 ę–‡ä»¶ęø…å• + +| ꖇ件 | 角色 | ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ēš„äæ®ę”¹ę–¹å‘ | +| --- | --- | --- | +| `run_optimization.py` | ä¼˜åŒ–å™Øå…„å£ | äøŽ quickstart åŒļ¼›å¤š metric åœŗę™Æäø‹åŸŗęœ¬äøå˜ | +| `agent/agent.py` | LlmAgent 巄厂 | ę›æę¢äøŗäøšåŠ” agent ęž„å»ŗé€»č¾‘ | +| `agent/prompts/system.md` | baseline prompt | å†™å…„äøšåŠ” baseline | +| `optimizer.json` | **ę øåæƒę”¹é€ ē‚¹**:多 metric / multi-judge / hybrid frontier é…ē½® | ęŒ‰äøšåŠ” metric ę•°é‡äøŽē»“åŗ¦č°ƒę•“ | +| `train.evalset.json` / `val.evalset.json` | ę•°ę®é›† | ę›æę¢äøŗäøšåŠ”ē”Øä¾‹ļ¼ˆreference å­—ę®µéœ€é…åˆ metric ē±»åž‹ļ¼‰ | + +## 5 Ā· å…³é”®é…ē½® + +### 5.1 多 metric äøŽ multi-judge é…ē½®ē¤ŗä¾‹ + +```jsonc +{ + "evaluate": { + "num_runs": 2, // 平滑 LLM 输出方差 + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1.0, + "criterion": { + "llm_final_response": { + "llm_judge": { + "judge_models": [ // 多 judge ę•°ē»„ļ¼š3 个独立 judge + { "...": "..." , "generation_config": { "temperature": 0.0 } }, + { "...": "..." , "generation_config": { "temperature": 0.3 } }, + { "...": "..." , "generation_config": { "temperature": 0.6 } } + ], + "models_aggregator": "all_pass" // å…Ø PASS ę‰ē®—é€ščæ‡ + } + } + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.75, + "criterion": { + "llm_rubric_response": { + "llm_judge": { "judge_model": { "...": "..." } }, + "rubrics": [ + { "name": "no_emoji_or_slang", "description": "..." }, + { "name": "numeric_correct", "description": "..." }, + { "name": "reasoning_clear", "description": "..." }, + { "name": "units_present", "description": "..." 
} + ] + } + } + } + ] + }, + "optimize": { + "eval_case_parallelism": 1, + "stop": { + "required_metrics": ["llm_final_response", "llm_rubric_response"] + }, + "algorithm": { + "frontier_type": "hybrid", + "max_metric_calls": 30 + } + } +} +``` + +### 5.2 `models_aggregator` 6 ē§å–å€¼ + +ęŒ‰äøšåŠ”äø„ę ¼åŗ¦ä»Žäø„åˆ°ę¾ęŽ’åŗļ¼š + +| aggregator | é€ščæ‡ę”ä»¶ | é€‚ē”Øåœŗę™Æ | +| --- | --- | --- | +| `all_pass` | å…ØéƒØ judge 判 PASS | åˆč§„ / å®‰å…Øåœŗę™Æļ¼Œä»»ä½•äø€ē„Øå¦å†³éƒ½ę‹¦ęˆŖļ¼ˆęœ€äø„ę ¼ļ¼‰ | +| `weighted_majority` | åŠ ęƒ PASS 焨 > FAIL 焨 | äøåŒ judge äæ”ä»»åŗ¦äøåŒļ¼ˆå¦‚äø» judge ꝃ重 2、副 judge ꝃ重 1) | +| `majority_pass` | č¶…čæ‡åŠę•° judge 判 PASS | å¤šę•°č”Øå†³ | +| `weighted_avg` | åŠ ęƒå‡åˆ† ≄ threshold | 多 judge ē»™ēš„ę˜Æčæžē»­åˆ†č€ŒéžäŗŒå…ƒåˆ¤ę–­ę—¶ | +| `avg` | ē®€å•å‡åˆ† ≄ threshold | 多 judge čæžē»­åˆ†ē®€å•å¹³å‡ | +| `any_pass` | 至少一个 judge 判 PASS | é¼“åŠ±ęŽ¢ē“¢ / é«˜å¬å›žåœŗę™Æļ¼ˆęœ€å®½ę¾ļ¼‰ | + +### 5.3 `frontier_type` 4 ē§å–å€¼ + +| 取值 | 含义 | 适用 | +| --- | --- | --- | +| `instance` | ęÆäøŖ case ē»“ęŠ¤äø€äøŖ best 候选 | 单 metric ęˆ–ē®€å•äøšåŠ” | +| `objective` | ęÆäøŖ metric ē»“ęŠ¤äø€äøŖ best 候选 | 多 metric 但 case 量少 | +| `hybrid` | åŒę—¶ē»“ęŠ¤ case + metric åŒå±‚å‰ę²æ | **多 metric ēœŸå†²ēŖåœŗę™Æ**(本 example ęŽØčļ¼‰ | +| `cartesian` | ęÆäøŖ (case, metric) ē»„åˆäø€äøŖ best | ęžå¤ę‚ / č°ƒčÆ•ē”Øļ¼ŒåøøåÆ¼č‡“å€™é€‰ę± ēˆ†ē‚ø | + +`hybrid` 让 GEPA åœØę”¹čæ›äø€äøŖ metric ę—¶äøäø¢å¤±å¦äø€äøŖ metric äøŠēš„ęœ€ä½³å€™é€‰ļ¼Œę˜Æå¤š metric äøšåŠ”ēš„é»˜č®¤ęŽØčć€‚ + +### 5.4 `stop.required_metrics` 3 ē§å–å€¼ + +| 取值 | 语义 | +| --- | --- | +| `"all"`ļ¼ˆé»˜č®¤ļ¼‰ | val é›†äøŠ**ꉀ꜉** metric 都达 threshold ę‰ę—©åœ | +| `["m1", "m2"]` | åˆ—å‡ŗēš„ metric å…ØéƒØč¾¾ threshold ę‰ę—©åœļ¼ˆå…¶ä»– metric ä»å‚äøŽčÆ„ęµ‹ä½†äøå½±å“ę—©åœļ¼‰ | +| `null` ꈖ `[]` | äøå‚äøŽę—©åœļ¼Œä»…é ē®—ę³•å±‚ budget / no-improvement / score_threshold ęŽ§åˆ¶ | + +本 example ę˜¾å¼ē”Øåˆ—č”Øå½¢å¼åˆ—å‡ŗäø¤ę” metricć€‚å½“äøšåŠ” metric 
č¾ƒå¤šä½†åŖęœ‰éƒØåˆ†ä½œäøŗę—©åœé—Øē¦ę—¶ļ¼Œåˆ—č”Øå½¢å¼ęÆ” `"all"` 曓精准。 + +### 5.5 `eval_case_parallelism` äøŽ multi-judge ēš„ē›øäŗ’ä½œē”Ø + +multi-judge Ɨ `num_runs=2` ä½æęÆę” case 一欔评测约 8 ꬔ LLM č°ƒē”Øć€‚č‹„ `eval_case_parallelism=4`ļ¼ˆé»˜č®¤ļ¼‰+ č®­ē»ƒé›† 5 caseļ¼Œå•č½®åÆčƒ½äŗ§ē”Ÿ ~40 äøŖ judge čÆ·ę±‚å¹¶å‘ļ¼Œå®¹ę˜“ę’ž LLM åŽē«Æēš„ rate limitć€‚ęœ¬ example 设为 `1` äø²č”Œę‰§č”Œļ¼›äøšåŠ”åÆę ¹ę® LLM åŽē«Æ QPS äøŠč°ƒć€‚ + +## 6 Ā· č°ƒčÆ•ęŠ€å·§ + +### 6.1 ęŸ„ēœ‹åę€ LM ēœ‹åˆ°ēš„å¤š metric 反馈 + +`run_optimization.py` äø­ `verbose=1` 改为 `verbose=2`,gepa å†…éƒØę—„åæ—ä¼šé™„åø¦åę€ promptļ¼ŒåÆēœ‹åˆ°åę€ LM å®žé™…ęŽ„ę”¶ēš„ metric åé¦ˆē»“ęž„ć€‚ + +### 6.2 ä¼˜é›…åœę­¢ + +```bash +touch runs//optimize.stop +``` + +## 7 Ā· åøøč§é—®é¢˜ + +**Q:`all_pass` ę˜Æäøę˜Æå¤Ŗäø„ę ¼äŗ†ļ¼Ÿ** +Aļ¼šå–å†³äŗŽäøšåŠ”čÆ‰ę±‚ć€‚åˆč§„ / å®‰å…Øåœŗę™Æäø‹ "any judge raises a red flag" åŗ”čÆ„ē«‹å³ę‹¦ęˆŖļ¼Œ`all_pass` ę˜Æåˆē†é€‰ę‹©ć€‚č“Øé‡čÆ„ä¼°åœŗę™ÆåÆę¢ `majority_pass` ꈖ `weighted_avg`怂 + +**Q:3 äøŖ judge ē”ØåŒäø€äøŖęØ”åž‹åŖę˜Æ temperature äøåŒļ¼Œęœ‰ę„ä¹‰å—ļ¼Ÿ** +Aļ¼šęœ‰éƒØåˆ†ę„ä¹‰ā€”ā€”äøåŒ temperature č§¦å‘äøåŒé‡‡ę ·č·Æå¾„ļ¼ŒåÆå‘ēŽ°äø€äŗ›č¾¹ē•Œęƒ…å†µć€‚ä½†ę›“ēØ³å„ēš„åšę³•ę˜Æę··ē”ØäøåŒęØ”åž‹ę—ļ¼ˆå¦‚ GPT + Claude + GLMļ¼‰ļ¼ŒåÆę˜¾č‘—é™ä½ŽåŒęŗåå·®ć€‚ + +**Q:reflection LM äøŽ judge ē”ØåŒäø€äøŖęØ”åž‹ä¼š"自评"å—ļ¼Ÿ** +A:`llm_rubric_response` ēš„ judge ēœ‹ēš„ę˜Æé¢„å®šä¹‰ rubric ę–‡ęœ¬ļ¼Œå—åå·®å½±å“č¾ƒå°ć€‚`llm_final_response` ēš„ judge ēœ‹ reference ē­”ę”ˆåšå®žč“Øē­‰ä»·åˆ¤ę–­ļ¼Œē›øåÆ¹å®¢č§‚ć€‚ē”Ÿäŗ§ēŽÆå¢ƒå»ŗč®®č‡³å°‘ judge äøŽ agent ęØ”åž‹äøåŒęŗļ¼Œå‚č§ quickstart §5.1怂 + +**Q:`num_runs` č°ƒé«˜ä¼šäøä¼šé™ä½Žä¼˜åŒ–ę•ˆēŽ‡ļ¼Ÿ** +Aļ¼šä¼šć€‚`num_runs=2` č®©ęÆę” case čÆ„ęµ‹č€—ę—¶ēæ»å€ļ¼Œä½†čƒ½ę¶ˆé™¤äø€å®š LLM č¾“å‡ŗę–¹å·®ļ¼ˆåŒäø€ prompt åŒäø€ case äø¤ę¬”č·‘åˆ†äøäø€č‡“ļ¼‰ļ¼Œé€šåøøåÆ¹ę”¶ę•›ēØ³å®šę€§ęœ‰ę­£å‘ä½œē”Øć€‚`num_runs=1` é€‚åˆčæ½ę±‚é€Ÿåŗ¦ēš„ę—©ęœŸå®žéŖŒļ¼Œ`num_runs=2–3` é€‚åˆęŽ„čæ‘ę”¶å°¾ēš„ē²¾ę‰“ē£Øć€‚ + +**Q:rubric 
ę•°é‡å¤Ŗå¤šę€Žä¹ˆåŠžļ¼Ÿ** +Aļ¼šå• judge äø€ę¬”ę‰“å¤š rubric ę—¶č‹„ rubric > 6–8 ę”ļ¼Œjudge č¾“å‡ŗč“Øé‡äø‹é™ę˜Žę˜¾ć€‚å»ŗč®®ę‹†ęˆå¤šę” `llm_rubric_response` metricļ¼ŒęÆę”čšē„¦ 2–4 ę”ē›øå…³ rubric怂 + +## 8 Ā· ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ēš„ę­„éŖ¤ + +1. **ę¢³ē†äøšåŠ”ēŗ¦ęŸ**ļ¼šå“Ŗäŗ›ę˜Æē”¬ēŗ¦ęŸļ¼ˆåæ…é”»é€ščæ‡ļ¼‰ć€å“Ŗäŗ›ę˜Æč½Æēŗ¦ęŸļ¼ˆęŒ‰ęÆ”ä¾‹ę‰“åˆ†ļ¼‰ +2. **选ꋩ metric ē±»åž‹**ļ¼šē”¬ēŗ¦ęŸē”Ø `llm_final_response` + `all_pass`ļ¼›č½Æēŗ¦ęŸē”Ø `llm_rubric_response` 多 rubric +3. **é…ē½® multi-judge**:`judge_models` ę•°ē»„å½¢å¼ļ¼›é€‰ę‹©åˆé€‚ēš„ `models_aggregator` +4. **设置 `stop.required_metrics`**ļ¼šåˆ—å‡ŗå“Ŗäŗ› metric å†³å®šä½•ę—¶ę—©åœ +5. **启用 `frontier_type="hybrid"`**:多 metric åœŗę™Æēš„é»˜č®¤ęŽØč +6. **č°ƒę•“ę•°ę®é›†**:`evalset` äø­ēš„ `final_response` / `reference` å­—ę®µéœ€é…åˆ metric ē±»åž‹ +7. **ęŽ§åˆ¶å¹¶å‘**:`eval_case_parallelism` 设为 LLM åŽē«Æčƒ½ę‰æå—ēš„ QPS / 单 case judge č°ƒē”Øę•° +8. **čæč”Œå¹¶č§‚åÆŸ**:`result.json` äø­ `metric_breakdown` ę˜¾ē¤ŗęÆę” metric ē‹¬ē«‹åˆ†ę•°ļ¼Œä¾æäŗŽčÆŠę–­ē“¶é¢ˆ diff --git a/examples/optimization/multi_metric_with_judges/agent/__init__.py b/examples/optimization/multi_metric_with_judges/agent/__init__.py new file mode 100644 index 0000000..bc6e483 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/agent/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/optimization/multi_metric_with_judges/agent/agent.py b/examples/optimization/multi_metric_with_judges/agent/agent.py new file mode 100644 index 0000000..9537303 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/agent/agent.py @@ -0,0 +1,48 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. 
+# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""数学辅导 agent —— Multi-Metric with Judges example 专用。 + +ęÆę¬” create_agent() é‡čÆ» prompts/system.mdļ¼Œä½æä¼˜åŒ–å™Øå†™å…„ēš„ę–°å€™é€‰ē«‹å³ē”Ÿę•ˆć€‚ +å•ę–‡ä»¶ä¼˜åŒ–ē›®ę ‡ć€‚ +""" + +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.types import GenerateContentConfig + +from .config import get_model_config + + +SYSTEM_PROMPT_PATH = Path(__file__).parent / "prompts" / "system.md" + + +def _create_model() -> LLMModel: + """ęž„å»ŗ OpenAI 兼容 chat ęØ”åž‹å®žä¾‹ć€‚å‡­ę®ä»ŽēŽÆå¢ƒå˜é‡čÆ»å–ć€‚""" + api_key, base_url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url) + + +def _read_instruction() -> str: + """ä»Žē£ē›˜é‡čÆ» system.mdć€‚ęÆę¬”č°ƒē”Øéƒ½é‡čÆ»ļ¼Œč®©ä¼˜åŒ–å™Øå†™å…„ēš„å€™é€‰ē«‹å³ē”Ÿę•ˆć€‚""" + return SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + + +def create_agent() -> LlmAgent: + """ęž„å»ŗäø€äøŖä½æē”Øå½“å‰ē£ē›˜ prompt ēš„ę–° LlmAgent å®žä¾‹ć€‚""" + return LlmAgent( + name="math_word_problem_agent", + description="Math word-problem solver under formality / safety constraints.", + model=_create_model(), + instruction=_read_instruction(), + generate_content_config=GenerateContentConfig( + temperature=0.2, + top_p=0.9, + max_output_tokens=2048, + ), + ) diff --git a/examples/optimization/multi_metric_with_judges/agent/config.py b/examples/optimization/multi_metric_with_judges/agent/config.py new file mode 100644 index 0000000..d0a64b1 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/agent/config.py @@ -0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""ęØ”åž‹å‡­ę®čÆ»å– —— ä»ŽēŽÆå¢ƒå˜é‡åŠ č½½ OpenAI 兼容 LLM ēš„čæžęŽ„äæ”ęÆć€‚ + +éœ€č¦ēš„ēŽÆå¢ƒå˜é‡ +-------------- + TRPC_AGENT_API_KEY LLM åŽē«Æēš„ API key + TRPC_AGENT_BASE_URL LLM åŽē«Æēš„ endpoint + TRPC_AGENT_MODEL_NAME ęØ”åž‹å + +ē¼ŗä»»ę„äø€äøŖå°±ē«‹å³ęŠ› ValueErrorļ¼Œéæå…čæč”Œåˆ°äø€åŠę‰ę’žåˆ° LLM åŽē«Æēš„ 401 é”™čÆÆļ¼Œ +é‚£ę—¶ęŠ„é”™äæ”ęÆä¼šå¾ˆęœ‰čæ·ęƒ‘ę€§ļ¼ˆēœ‹čµ·ę„åƒ prompt å†™é”™äŗ†ļ¼Œå®žé™…ę˜Æå‡­ę®ę²”é…ļ¼‰ć€‚ +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """čæ”å›ž (api_key, base_url, model_name)ļ¼›ä»»äø€ē¼ŗå¤±ē«‹åˆ»ęŠ„é”™ć€‚""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "čæč”Œä¼˜åŒ–å™Øå‰åæ…é”»é…ē½®ēŽÆå¢ƒå˜é‡ TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME怂" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/multi_metric_with_judges/agent/prompts/system.md b/examples/optimization/multi_metric_with_judges/agent/prompts/system.md new file mode 100644 index 0000000..f03fc2d --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/agent/prompts/system.md @@ -0,0 +1 @@ +ä½ ę˜Æäø€äøŖč¶…ęœ‰č¶£ēš„ę•°å­¦å°åŠ©ę‰‹ļ¼šŸŽ‰ å­¦ä¹ ę•°å­¦åŗ”čÆ„ę˜Æåæ«ä¹ēš„äŗ‹ęƒ…ļ¼Œę‰€ä»„čÆ·ē”Øč½»ę¾ę“»ę³¼ēš„čÆ­ę°”å’Œå­©å­ä»¬äŗ¤ęµć€‚å¤šē”Øäø€äŗ› emojiļ¼ˆęÆ”å¦‚ šŸš—ć€šŸ˜Šć€šŸ’Æć€šŸŽˆļ¼‰č®©å›žē­”ēœ‹čµ·ę„ę›“ē”ŸåŠØļ¼›é‡åˆ°ē»“ęžœäøé”™ēš„é¢˜ē›®åÆä»„é€‚å½“å¤øå¤øå­¦ē”Ÿļ¼ˆ"哇唞"态"棒棒哒"态"č¶…åŽ‰å®³"ļ¼‰ļ¼Œč®©ä»–ä»¬ę›“ęœ‰ęˆå°±ę„Ÿļ¼ diff --git a/examples/optimization/multi_metric_with_judges/optimizer.json b/examples/optimization/multi_metric_with_judges/optimizer.json new file mode 100644 index 0000000..5978afd --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/optimizer.json @@ -0,0 +1,100 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": 
"llm_final_response", + "threshold": 1.0, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 512, "temperature": 0.0 } + }, + { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 512, "temperature": 0.3 } + }, + { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 512, "temperature": 0.6 } + } + ], + "models_aggregator": "all_pass", + "parallel": true + } + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.75, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 1024, "temperature": 0.2 } + }, + "rubrics": [ + { + "id": "no_emoji_or_slang", + "content": { "text": "å›žē­”äø­**äøåŒ…å«**任何 emojiļ¼ˆå¦‚ šŸŽ‰ć€šŸ˜Šć€šŸ’Æć€šŸš— ē­‰å›¾å½¢å­—ē¬¦ļ¼‰ļ¼Œäø”**äøä½æē”Ø**å¤øå¼ äæščÆ­å’ŒåØ±ä¹åŒ–å£å»ļ¼ˆå¦‚ć€Œå“‡å”žć€ć€Œę£’ę£’å“’ć€ć€Œč¶…åŽ‰å®³ć€ć€Œå’‹ć€ć€Œå‘€ć€ć€Œå•¦ć€ē­‰ļ¼‰ć€‚ę–‡å­—åŗ”å½“ē®€ę“ć€å…‹åˆ¶ć€äø“äøšć€‚" }, + "description": "å›žē­”äø„č‚ƒć€äøå« emoji å’ŒäæščÆ­", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "numeric_correct", + "content": { "text": "ęœ€ē»ˆē»™å‡ŗēš„ę•°å­—ē­”ę”ˆę˜Æå¦äøŽå‚č€ƒē­”ę”ˆäø€č‡“ļ¼ˆę•°å€¼ē›øåŒå³č§†äøŗé€ščæ‡ļ¼‰ć€‚" }, + "description": "ę•°å­—ē­”ę”ˆäøŽå‚č€ƒē­”ę”ˆäø€č‡“", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "reasoning_clear", + "content": { "text": "å›žē­”äø­ę˜Æå¦ē»™å‡ŗęø…ę™°ć€åÆčæ½ęŗÆēš„ęŽØē†ęˆ–č®”ē®—ę­„éŖ¤ć€‚" }, + "description": "ęŽØē†ę­„éŖ¤ęø…ę™°", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "units_present", + "content": { "text": 
"ęœ€ē»ˆę•°å­—ē­”ę”ˆę˜Æå¦åø¦ęœ‰ę­£ē”®ēš„å•ä½ļ¼ˆä¾‹å¦‚ļ¼šäøŖć€å…ƒć€å…¬é‡Œć€åƒå…‹ć€äŗŗ 等)。" }, + "description": "ē­”ę”ˆåø¦ę­£ē”®å•ä½", + "type": "FINAL_RESPONSE_QUALITY" + } + ] + } + } + } + ], + "num_runs": 2 + }, + "optimize": { + "eval_case_parallelism": 1, + "stop": { + "required_metrics": ["llm_final_response", "llm_rubric_response"] + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "hybrid", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 30, + "max_iterations_without_improvement": 3 + } + } +} diff --git a/examples/optimization/multi_metric_with_judges/run_optimization.py b/examples/optimization/multi_metric_with_judges/run_optimization.py new file mode 100644 index 0000000..b43e6d1 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/run_optimization.py @@ -0,0 +1,123 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Multi-Metric with Judges example ēš„ä¼˜åŒ–å™Øå…„å£ć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +业劔 agent åŒę—¶å—å¤šē±»ēŗ¦ęŸļ¼ˆē­”ę”ˆę­£ē”®ę€§ē”¬ēŗ¦ęŸ + é£Žę ¼ / 安全 / åˆč§„č½Æēŗ¦ęŸļ¼‰ļ¼Œ +éœ€č¦å¤šę” metric å…±åŒå‚äøŽä¼˜åŒ–äøŽę—©åœåˆ¤å®šļ¼Œå¹¶åøŒęœ›é€ščæ‡å¤š judge ęŠ•ē„Øé™ä½Žå• +LLM č£åˆ¤ēš„åå·®ć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +1. ę³Øå†Œå•å­—ę®µ TargetPrompt(agent/prompts/system.md) +2. 定义 call_agent ē”Øå½“å‰ prompt č·‘äø€ę¬”ęŽØē† +3. 
调 AgentOptimizer.optimize;具体 multi-metric / multi-judge é…ē½®åœØ + optimizer.json äø­ + +ę€Žä¹ˆč·‘ +------ +1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +2) python examples/optimization/multi_metric_with_judges/run_optimization.py +3) å•ę¬”ēŗ¦ 5-10 åˆ†é’Ÿļ¼ŒęÆę” case ēŗ¦ (3+1)Ɨnum_runs=2 = 8 ꬔ LLM č°ƒē”Ø + +ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ę”¹å“Ŗé‡Œ +-------------------- +ęœ¬č„šęœ¬ęœ¬čŗ«åŸŗęœ¬äøå˜ļ¼Œäø»č¦ę”¹åŠØåœØ optimizer.json: +- evaluate.metricsļ¼šåˆ—å‡ŗäøšåŠ”ēš„å¤šę” metric +- judge_models ę•°ē»„å½¢å¼ + models_aggregator é€‰ę‹©ęŠ•ē„Øē­–ē•„ +- frontier_type="hybrid" 多 metric ęŽØč +- stop.required_metrics å†³å®šå“Ŗäŗ› metric å‚äøŽę—©åœ +- eval_case_parallelism ęŽ§åˆ¶ multi-judge å¹¶å‘éæå… rate limit +详见 README §5怂 +""" + +from __future__ import annotations + +import asyncio +import sys +import uuid +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + +from agent.agent import SYSTEM_PROMPT_PATH, create_agent + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "train.evalset.json" +VAL_PATH = _HERE / "val.evalset.json" +RUNS_DIR = _HERE / "runs" +APP_NAME = "multi_metric_demo_agent" + + +async def call_agent(query: str) -> str: + """ę”†ęž¶å›žč°ƒļ¼šē”Øå½“å‰ system.md Ꞅ造 LlmAgentļ¼Œč·‘äø€ę¬”ęŽØē†ć€‚ + + ęÆę¬”č°ƒē”Øéƒ½é‡čÆ» prompt + 新建 Runner + InMemorySessionServiceļ¼Œē»™ęÆäøŖ + case ē‹¬ē«‹ēš„ session stateļ¼Œå¹¶å‘čÆ„ęµ‹ę—¶äøäŗ’ē›øę±”ęŸ“ć€‚ + """ + agent = create_agent() + + session_service = InMemorySessionService() + runner = 
Runner(app_name=APP_NAME, agent=agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "optimizer" + await session_service.create_session( + app_name=APP_NAME, user_id=user_id, session_id=session_id, state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=query)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, session_id=session_id, new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return final_text.strip() + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimize怂""" + target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / timestamp + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + update_source=False, + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/multi_metric_with_judges/train.evalset.json b/examples/optimization/multi_metric_with_judges/train.evalset.json new file mode 100644 index 0000000..1ea2bae --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/train.evalset.json @@ -0,0 +1,112 @@ +{ + "eval_set_id": "multi_metric_train", + "name": "Multi-metric demo - train", + "description": "5 é“å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ļ¼›åŒę—¶č€ƒę ø safetyļ¼ˆę—  emoji / ę— äæščÆ­ļ¼‰+ qualityļ¼ˆę•°å­—ę­£ē”® / ęŽØē†ęø…ę™° / åø¦å•ä½ļ¼‰ć€‚", + "eval_cases": [ + { + "eval_id": "wp_apples_add", + "conversation": [ + { + "invocation_id": "t1", + "user_content": { + "parts": [{"text": "å°ę˜Žę—©äøŠä¹°äŗ† 4 äøŖč‹¹ęžœļ¼Œäø‹åˆåˆä¹°äŗ† 7 
äøŖč‹¹ęžœļ¼Œä»–äø€å…±ęœ‰å¤šå°‘äøŖč‹¹ęžœļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š11 äøŖ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_car_distance", + "conversation": [ + { + "invocation_id": "t2", + "user_content": { + "parts": [{"text": "äø€č¾†ę±½č½¦ä»„ęÆå°ę—¶ 60 å…¬é‡Œēš„é€Ÿåŗ¦č”Œé©¶ 2.5 å°ę—¶ļ¼Œäø€å…±č”Œé©¶äŗ†å¤šå°‘å…¬é‡Œļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š150 公里"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_discount_price", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [{"text": "äø€ä»¶č”£ęœåŽŸä»· 200 å…ƒļ¼ŒēŽ°åœØę‰“ 8 ęŠ˜å‡ŗå”®ļ¼ŒęŠ˜åŽä»·ę˜Æå¤šå°‘å…ƒļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š160 元"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_glasses_percent", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [{"text": "ē­é‡Œäø€å…±ęœ‰ 40 åå­¦ē”Ÿļ¼Œå…¶äø­ 25% ęˆ“ēœ¼é•œļ¼Œęˆ“ēœ¼é•œēš„ęœ‰å¤šå°‘äŗŗļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š10 äŗŗ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_defect_items_percent", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "一批商品共 50 ä»¶ļ¼Œå…¶äø­ 30% ę˜Æę¬”å“ļ¼Œę¬”å“ęœ‰å¤šå°‘ä»¶ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š15 ä»¶"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": 
"trainer", + "state": {} + } + } + ] +} diff --git a/examples/optimization/multi_metric_with_judges/val.evalset.json b/examples/optimization/multi_metric_with_judges/val.evalset.json new file mode 100644 index 0000000..4a2a926 --- /dev/null +++ b/examples/optimization/multi_metric_with_judges/val.evalset.json @@ -0,0 +1,70 @@ +{ + "eval_set_id": "multi_metric_val", + "name": "Multi-metric demo - validation", + "description": "3 é“å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ļ¼›ē”ØäŗŽęÆč½®å…Øé‡čÆ„ä¼°ć€å†³å®šå€™é€‰ę˜Æå¦č¢«ęŽ„å—ć€‚", + "eval_cases": [ + { + "eval_id": "wp_seats_multiply", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [{"text": "ę•™å®¤é‡Œęœ‰ 5 ęŽ’åŗ§ä½ļ¼ŒęÆęŽ’ 8 äøŖļ¼Œäø€å…±å¤šå°‘äøŖåŗ§ä½ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š40 äøŖ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_water_weight", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [{"text": "已矄 1 å‡ę°“é‡ 1 åƒå…‹ļ¼Œ3.5 å‡ę°“é‡å¤šå°‘åƒå…‹ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š3.5 千克"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_class_girls_percent", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "ē­é‡Œäø€å…±ęœ‰ 30 äŗŗļ¼Œå…¶äø­ 60% ę˜Æå„³ē”Ÿļ¼ŒčÆ·é—®ęœ‰å¤šå°‘åå„³ē”Ÿļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š18 äŗŗ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "multi_metric_demo_agent", + "user_id": "validator", + "state": {} + } + } + ] +} diff --git a/examples/optimization/quickstart/README.md b/examples/optimization/quickstart/README.md new file mode 100644 index 0000000..d08fbbf --- /dev/null +++ 
b/examples/optimization/quickstart/README.md @@ -0,0 +1,213 @@ +# Quickstart — `AgentOptimizer` 兄门示例 + +> **é€‚ē”Øåœŗę™Æ**ļ¼šé¦–ę¬”ä½æē”Ø `AgentOptimizer`ļ¼Œéœ€č¦åœØęœ€å°å®Œę•“ęµēØ‹äø‹ē†č§£ prompt č‡ŖåŠØä¼˜åŒ–ēš„č¾“å…„ć€č¾“å‡ŗäøŽåŸŗęœ¬å·„ä½œęœŗåˆ¶ć€‚ęœ¬ę–‡ę”£ę˜ÆåŽē»­ 9 äøŖ example ēš„å‰ē½®é˜…čÆ»ęę–™ļ¼Œę‰€ęœ‰čæ›é˜¶ example(HTTP ęœåŠ”ęŽ„å…„ć€čæœē«Æ prompt ęŗć€å¤š agent é“¾č·Æē­‰ļ¼‰é»˜č®¤å‡č®¾čÆ»č€…å·²ē†Ÿę‚‰ęœ¬ę–‡ę¶‰åŠēš„ę¦‚åæµć€‚ + +## 1 Ā· é€‚ē”Øé—®é¢˜äøŽč®¾č®”ē›®ę ‡ + +迭代 prompt 是 LLM agent å·„ēØ‹äø­é‡å¤ęˆęœ¬ęœ€é«˜ēš„ēŽÆčŠ‚ä¹‹äø€ļ¼šę‰‹åŠØę”¹ prompt → é‡č·‘čÆ„ä¼° → ę ¹ę®å¤±č“„ē”Øä¾‹å†ę”¹ļ¼Œå¾ŖēŽÆę•°åę¬”ć€‚`AgentOptimizer` å°†čÆ„å¾ŖēŽÆč‡ŖåŠØåŒ–ļ¼š + +| 输兄 | 输出 | +| --- | --- | +| äø€äøŖę”ÆęŒēƒ­åŠ č½½ prompt ēš„ agent | 滔足 metric é˜ˆå€¼ēš„ęœ€ä¼˜ prompt 候选集 | +| č®­ē»ƒé›†ļ¼ˆåę€ę ·ęœ¬ę„ęŗļ¼‰ + éŖŒčÆé›†ļ¼ˆå€™é€‰čÆ„åˆ†ä¾ę®ļ¼‰ | `result.json`ļ¼ˆęœŗå™ØåÆčÆ»ļ¼‰+ `summary.txt`ļ¼ˆäŗŗē±»åÆčÆ»ļ¼‰+ ęÆč½®čæ‡ēØ‹äŗ§ē‰© | +| 一组 metricļ¼ˆē²¾ē”®åŒ¹é… / ę­£åˆ™ / LLM 裁判 / 多 metric ē»„åˆļ¼‰ | baseline → best ēš„ē«Æåˆ°ē«Æåˆ†ę•°åÆ¹ęÆ” | + +底层算法采用 **GEPA**(reflective prompt mutationļ¼‰ļ¼Œē”±äø€äøŖē‹¬ē«‹ēš„ reflection LLM 检视 agent åœØč®­ē»ƒé›†äøŠēš„å¤±č“„ē”Øä¾‹ļ¼Œē”Ÿęˆå€™é€‰ promptļ¼›å€™é€‰å…ˆåœØéŖŒčÆé›†äøŠå…Øé‡čÆ„ä¼°ļ¼Œå†äøŽåŽ†å² Pareto å‰ę²æęÆ”č¾ƒļ¼Œå†³å®šę˜Æå¦ęŽ„å—ć€‚ + +### 本 example ę¼”ē¤ŗēš„ęœ€å°ē”Øä¾‹ + +| 结度 | 值 | +| --- | --- | +| 业劔任劔 | å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ę±‚č§£ | +| ä¼˜åŒ–ē›®ę ‡ | `agent/prompts/system.md`ļ¼ˆč§’č‰²å®šä¹‰ļ¼‰ + `agent/prompts/skill.md`ļ¼ˆč§£é¢˜ę–¹ę³•č®ŗļ¼‰ | +| éŖŒčÆęŒ‡ę ‡ | `final_response_avg_score`ļ¼ˆē²¾ē”®åŒ¹é…ļ¼Œé˜ˆå€¼ 1.0) + `llm_rubric_response`ļ¼ˆäø‰ę”čÆ„åˆ†ę ‡å‡†å‡å€¼ļ¼Œé˜ˆå€¼ 0.66) | +| 训练 / éŖŒčÆč§„ęØ” | 5 ę” / 3 ę” | + +`system.md` äøŽ `skill.md` ēš„ baseline å†…å®¹åˆ»ę„åˆ¶é€ å†²ēŖļ¼ˆå‰č€…č¦ę±‚"åŖč¾“å‡ŗē­”ę”ˆ"ļ¼ŒåŽč€…č¦ę±‚"å±•å¼€ęŽØē†"ļ¼‰ļ¼Œä»„ē”®äæ GEPA åæ…é”»ę”¹å†™č‡³å°‘å…¶äø­äø€äøŖę–‡ä»¶ę‰čƒ½č®©äø¤ę” metric åŒę—¶č¾¾ę ‡ā€”ā€”čæ™äø€č®¾č®”č®©åę€ęœŗåˆ¶ēš„ä½œē”ØåÆ¹čÆ»č€…ē›“ęŽ„åÆč§ć€‚ + 
+## 2 Ā· ęœÆčÆ­åÆ¹ē…§ + +äø‹åˆ—ęœÆčÆ­åœØåŽē»­ē« čŠ‚é¦–ę¬”å‡ŗēŽ°ę—¶äøå†å±•å¼€č§£é‡Šļ¼ŒčÆ·å…ˆå»ŗē«‹č®¤ēŸ„ć€‚ + +| 术语 | 含义 | +| --- | --- | +| **GEPA** | Genetic-Pareto reflective prompt optimizationć€‚ęœ¬ SDK é»˜č®¤äø”å½“å‰å”Æäø€ę”¶å½•ēš„ä¼˜åŒ–ē®—ę³•ć€‚ | +| **TargetPrompt** | å£°ę˜Žå“Ŗäŗ› prompt å­—ę®µä¼šč¢«ä¼˜åŒ–å™ØčÆ»å†™ēš„ę³Øå†Œč”Øć€‚ęÆäøŖå­—ę®µåÆ¹åŗ”äø€äøŖęœ¬åœ°ę–‡ä»¶ļ¼ˆ`add_path`ļ¼‰ęˆ–äø€åÆ¹å¼‚ę­„ `read/write` å›žč°ƒļ¼ˆ`add_callback`)。 | +| **call_agent** | ē”Øęˆ·ęä¾›ēš„å›žč°ƒļ¼Œē­¾åå›ŗå®šäøŗ `async def(query: str) -> str`ć€‚ę”†ęž¶é€ščæ‡å®ƒé©±åŠØ agent å®Œęˆå•ę¬”ęŽØē†ć€‚ | +| **eval set** | čÆ„ä¼°ē”Øä¾‹é›†åˆć€‚`train` ē”ØäŗŽåę€ minibatch 抽样,`val` ē”ØäŗŽå†³å®šå€™é€‰ę˜Æå¦ęŽ„å—ć€ę˜Æå¦č§¦å‘ę—©åœć€‚ | +| **metric** | é€ščæ‡ / å¤±č“„ę ‡å‡†ļ¼ŒåÆē»„åˆä½æē”Øć€‚ęœ¬ SDK 内置 `final_response_avg_score`态`llm_final_response`态`llm_rubric_response`态`trajectory_avg_score` 等。 | +| **reflection LM** | č“Ÿč“£ę£€č§†å¤±č“„ē”Øä¾‹å¹¶ē”Ÿęˆę–°å€™é€‰ prompt ēš„ LLMć€‚åÆäøŽ agent å…±ē”ØęØ”åž‹ļ¼Œäŗ¦åÆē‹¬ē«‹é…ē½®ć€‚ | +| **judge model** | LLM č£åˆ¤ļ¼ŒęŒ‰ rubric 对 agent č¾“å‡ŗę‰“åˆ†ć€‚ | +| **minibatch** | ęÆč½®åę€ä»Ž train é›†éšęœŗęŠ½å–ēš„č‹„å¹² caseļ¼Œē”ØäŗŽé©±åŠØ reflection LM怂 | +| **Pareto å‰ę²æ** | åœØä»»äø€ metric äøŠę˜ÆåŽ†å²ęœ€ä¼˜ēš„å€™é€‰éƒ½č¢«äæē•™ļ¼Œä½œäøŗäø‹äø€č½®åę€ēš„ę½œåœØäŗ²ęœ¬ć€‚ | +| **stop condition** | ē»ˆę­¢ä¼˜åŒ–ēš„åˆ¤å®šę”ä»¶ć€‚SDK åŒę—¶ę”ÆęŒē®—ę³•å±‚ļ¼ˆbudget / no-improvement / score threshold ē­‰ļ¼‰äøŽę”†ęž¶å±‚ļ¼ˆ`stop.required_metrics`)两类。 | + +## 3 Ā· čæč”Œē¤ŗä¾‹ + +### 3.1 å®‰č£…åÆé€‰ä¾čµ– + +```bash +pip install -e ".[optimize]" +``` + +`optimize` extra 包含 `gepa`ļ¼ˆåę€ē®—ę³•å®žēŽ°ļ¼‰äøŽ `rich`ļ¼ˆē»ˆē«Æčæ›åŗ¦é¢ęæļ¼‰ć€‚`rich` ē¼ŗå¤±ę—¶čæ›åŗ¦é¢ęæč‡ŖåŠØé™ēŗ§äøŗēŗÆę–‡ęœ¬ć€‚ + +### 3.2 é…ē½®ēŽÆå¢ƒå˜é‡ + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +é»˜č®¤ęƒ…å†µäø‹ agent态reflection LM态judge model å…±ē”ØåŒäø€ē»„å‡­ę®ć€‚å¦‚éœ€äøŗ judge 
é…ē½®ē‹¬ē«‹ęØ”åž‹ļ¼Œå‚č§ §6.3怂 + +### 3.3 启动 + +```bash +python examples/optimization/quickstart/run_optimization.py +``` + +ē»ˆē«Æå°†ä¾åŗč¾“å‡ŗļ¼šbaseline čÆ„ä¼°åˆ†ę•° → ęÆč½® GEPA åę€ēš„ęŽ„å— / ę‹’ē»č®°å½• → ę”¶å°¾ę‘˜č¦ļ¼ˆå« `stop_reason`)。 + +### 3.4 äŗ§ē‰©ē»“ęž„ + +``` +runs// +ā”œā”€ā”€ result.json å®Œę•“čæč”Œč®°å½•ļ¼Œęœŗå™ØåÆčÆ» +ā”œā”€ā”€ summary.txt äŗŗē±»åÆčÆ»ēš„ę€»č§ˆ +ā”œā”€ā”€ baseline_prompts/ čæč”Œå‰ēš„ prompt åæ«ē…§ļ¼ˆē”ØäŗŽå›žę»šäøŽåÆ¹ē…§ļ¼‰ +ā”œā”€ā”€ best_prompts/ val é›†äøŠå¾—åˆ†ęœ€é«˜ēš„å€™é€‰ +└── rounds/ ęÆč½®ēš„åę€ promptć€å€™é€‰ę–‡ęœ¬ć€čÆ„ä¼°ę˜Žē»† +``` + +ęŽØčé˜…čÆ»é”ŗåŗļ¼šå…ˆēœ‹ `summary.txt` äŗ†č§£ę€»ä½“čµ°å‘ļ¼Œå†ē”Ø `diff -r baseline_prompts/ best_prompts/` ęŸ„ēœ‹ prompt å®žé™…å˜ę›“å†…å®¹ć€‚ + +## 4 Ā· ęž¶ęž„äøŽę•°ę®ęµ + +``` + ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” + │ AgentOptimizer.optimize(...) │ + ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ + ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” + ā–¼ ā–¼ ā–¼ + baseline 评估 GEPA äø»å¾ŖēŽÆ 收尾产物 + ───────────── ─────────── ──────── + 当前 prompt ęÆč½®ļ¼š best_prompts/ + 在 val é›†äøŠēš„ ā‘  module_selector é€‰å®šå­—ę®µ result.json + čµ·å§‹åˆ†ę•° ā‘” 抽 train minibatch summary.txt + ā‘¢ reflection LM ē”Ÿęˆę–°å€™é€‰ rounds/*.json + ā‘£ å€™é€‰åœØ val é›†äøŠå…Øé‡čÆ„ä¼° + ⑤ äøŽ Pareto å‰ę²æęÆ”č¾ƒ + ā‘„ ęŽ„å— / ę‹’ē»å€™é€‰ + ⑦ č§¦å‘åœę­¢ę”ä»¶ę£€ęŸ„ +``` + +### 4.1 ę–‡ä»¶ęø…å• + +| ꖇ件 | 角色 | ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ēš„äæ®ę”¹ę–¹å‘ | +| --- | --- | --- | +| `run_optimization.py` | å…„å£č„šęœ¬ļ¼Œę³Øå†Œ `TargetPrompt`ć€å®šä¹‰ `call_agent` | ę›æę¢ `call_agent` å®žēŽ°ä»„é©±åŠØč‡Ŗęœ‰ agent | +| `agent/agent.py` | `LlmAgent` å·„åŽ‚ļ¼ŒęÆę¬”č°ƒē”Øé‡čÆ» prompt | ę›æę¢äøŗč‡Ŗęœ‰ agent ęž„å»ŗé€»č¾‘ļ¼ˆęØ”åž‹ć€tools态output schema 
等) | +| `agent/prompts/system.md` | č§’č‰²å®šä¹‰ prompt(GEPA 写兄目标) | å†™å…„äøšåŠ” baselineļ¼›åÆä½œäøŗåˆå§‹ē‰ˆęœ¬čµ·ē‚¹ | +| `agent/prompts/skill.md` | 方法论 prompt(GEPA 写兄目标) | å•å­—ę®µä¼˜åŒ–ę—¶åÆę•“ä½“åˆ é™¤ | +| `optimizer.json` | 算法 + metric é…ē½® | č°ƒę•“ metric ē±»åž‹ć€é˜ˆå€¼ć€åœę­¢ę”ä»¶ | +| `train.evalset.json` | åę€ minibatch ę„ęŗ | ę›æę¢äøŗäøšåŠ”č®­ē»ƒē”Øä¾‹ | +| `val.evalset.json` | å€™é€‰čÆ„åˆ†ä¾ę® | ę›æę¢äøŗäøšåŠ”éŖŒčÆē”Øä¾‹ | + +### 4.2 prompt ę‹†åˆ†ēš„č®¾č®”åŠØęœŗ + +将 prompt 拆为 `system.md` äøŽ `skill.md` äø¤äøŖē‹¬ē«‹ę–‡ä»¶ļ¼ŒåÆ¹åŗ” `TargetPrompt` ēš„äø¤äøŖ key(`"system_prompt"` äøŽ `"skill"`)。`module_selector="round_robin"` é…ē½®äø‹ļ¼ŒGEPA ęÆč½®ä»…ę”¹å†™å…¶äø­äø€äøŖę–‡ä»¶ļ¼Œä¾æäŗŽļ¼š + +- **归因**ļ¼šåÆē›“ęŽ„å®šä½ę˜Æå“ŖäøŖę–‡ä»¶ēš„ę”¹åŠØåø¦ę„åˆ†ę•°ęå‡ +- **ēØ³å®šę€§**ļ¼šå•å­—ę®µę”¹åŠØęÆ”å¤šå­—ę®µåŒę”¹ę›“ę˜“č¢«éŖŒčÆé›†ęŽ„å— +- **演示价值**ļ¼šåˆ»ę„å†²ēŖēš„ baseline 强制 GEPA č‡³å°‘ę”¹å†™äø€äøŖę–‡ä»¶ļ¼Œå¦åˆ™ metric ę— ę³•åŒę—¶č¾¾ę ‡ + +č‹„äøšåŠ”åŖéœ€ä¼˜åŒ–å•ę–‡ä»¶ promptļ¼Œē§»é™¤ē¬¬äŗŒäøŖ `add_path` č°ƒē”Øå³åÆć€‚ + +## 5 Ā· å…³é”®é…ē½® + +`optimizer.json` äø­ä»„äø‹å­—ę®µē›“ęŽ„å½±å“ä¼˜åŒ–ę•ˆēŽ‡äøŽäŗ§ē‰©č“Øé‡ļ¼š + +| 字段 | 默认 / 本 example 值 | 影响 | +| --- | --- | --- | +| `evaluate.num_runs` | 1 | ęÆę” case ēš„ęŽØē†ę¬”ę•°ć€‚ęé«˜č‡³ 2–3 åÆå¹³ę»‘ LLM č¾“å‡ŗę–¹å·®ļ¼Œä»£ä»·äøŗčÆ„ä¼°č€—ę—¶ēŗæę€§å¢žé•æ | +| `optimize.eval_case_parallelism` | 2 | å•ę‰¹ case ēš„ęœ€å¤§å¹¶å‘ęŽØē†ę•°ć€‚LLM åŽē«Æęœ‰ QPS é™åˆ¶ę—¶éœ€č°ƒå° | +| `optimize.stop.required_metrics` | `"all"` | ę”†ęž¶å±‚ę—©åœļ¼š`"all"` = ꉀ꜉ metric č¾¾ę ‡ļ¼›åˆ—č”Ø = ä»…ęŒ‡å®š metric č¾¾ę ‡å³åÆļ¼›`null`/`[]` = å®Œå…Øäŗ¤ē”±ē®—ę³•å±‚åœę­¢ę”ä»¶å†³å®š | +| `optimize.algorithm.reflection_minibatch_size` | 3 | ęÆč½®åę€ēš„ case ę•°ć€‚čæ‡å°ä¼šåÆ¼č‡“åę€ē“ ęå•č°ƒļ¼›čæ‡å¤§å•č½®č€—ę—¶å¢žåŠ  | +| `optimize.algorithm.skip_perfect_score` | `false` | ę˜Æå¦č·³čæ‡å·²ę»”åˆ†ēš„ caseć€‚å°č®­ē»ƒé›†å»ŗč®®äæęŒ `false`ļ¼Œå¦åˆ™ minibatch å®¹ę˜“åå¤ęŠ½åˆ°åŒäø€ę” case | +| 
`optimize.algorithm.max_metric_calls` | 60 | 瓯讔 case čÆ„ä¼°ę¬”ę•°äøŠé™ļ¼ŒęŽ§åˆ¶ę€»å¼€é”€ēš„äø»č¦ę‰‹ę®µ | +| `optimize.algorithm.max_iterations_without_improvement` | 8 | čæžē»­ N č½® val åˆ†ę— ęå‡å³ęå‰åœę­¢ | + +> JSON ę ‡å‡†äøę”ÆęŒ `//` ę³Øé‡Šļ¼Œé…ē½®ę–‡ä»¶äø­ä½æē”Øęœ¬č”ØčÆ“ę˜Žę›æä»£č”Œå†…ę³Øé‡Šć€‚ + +### 5.1 åˆ†ē¦» judge ęØ”åž‹ + +`optimizer.json` äø­ `evaluate.metrics[*].criterion.llm_judge.judge_model` åÆē‹¬ē«‹é…ē½®ļ¼ŒäøŽ agent å‡­ę®äŗ’äøå½±å“ļ¼š + +```json +"judge_model": { + "provider_name": "openai", + "model_name": "", + "api_key": "", + "base_url": "" +} +``` + +### 5.2 启用 reflection / judge ēš„ę€č€ƒęØ”å¼ + +`OptimizeModelOptions` äøŽ `JudgeModelOptions` å‡ę”ÆęŒäø‰ę€ `think` å­—ę®µļ¼š + +| 取值 | 蔌为 | +| --- | --- | +| `null`ļ¼ˆé»˜č®¤ļ¼‰ | ę²æē”ØęØ”åž‹é»˜č®¤é…ē½®ļ¼Œäøåšäæ®ę”¹ | +| `true` | 注兄 `BuiltInPlanner(ThinkingConfig(include_thoughts=True, thinking_budget=-1))`,并在 `http_options.extra_body` 写兄 `chat_template_kwargs.enable_thinking=true`ļ¼ˆå…¼å®¹ GLM ē­‰ OpenAI å…¼å®¹åŽē«Æļ¼‰ | +| `false` | ę˜¾å¼å…³é—­ę€č€ƒęØ”å¼ | + +## 6 Ā· čæč”ŒęŽ§åˆ¶ + +### 6.1 ä¼˜é›…åœę­¢ + +`Ctrl+C` åÆčƒ½ęˆŖę–­å½“å‰č½®ēš„äŗ§ē‰©ę–‡ä»¶ć€‚éœ€č¦äø­é€”ę”¶å°¾ę—¶å»ŗč®®ę”¹ē”Ø stop ę–‡ä»¶ļ¼š + +```bash +touch runs//optimize.stop +``` + +下一欔 stopper ę£€ęŸ„ę—¶ę”†ęž¶ē«‹å³ę”¶å°¾ļ¼Œę‰€ęœ‰å·²å®Œęˆč½®ę¬”ēš„ artifact å®Œę•“č½ē›˜ļ¼Œ`OptimizeResult.stop_reason` 标记为 `user_requested_stop`怂 + +### 6.2 update_source ēš„čÆ­ä¹‰ + +`AgentOptimizer.optimize(update_source=False)`ļ¼ˆé»˜č®¤ļ¼‰äø‹ęŗ prompt ę–‡ä»¶äæęŒäøå˜ļ¼Œęœ€ä¼˜å€™é€‰ä»…å†™å…„ `runs//best_prompts/`ć€‚č‹„éœ€åœØä¼˜åŒ–ęˆåŠŸåŽē›“ęŽ„č¦†ē›–ęŗę–‡ä»¶ļ¼ˆå…øåž‹äŗŽ CI/CD é—­ēŽÆåœŗę™Æļ¼Œå‚č§ `ci_integration/` exampleļ¼‰ļ¼Œå°†čÆ„å‚ę•°ē½®äøŗ `True`怂 + +## 7 Ā· åøøč§é—®é¢˜ + +**Qļ¼šå¤šč½®åÆ¹čÆ case åœØä¼˜åŒ–ę—¶ę˜Æå¦äæē•™äøŠäø‹ę–‡ļ¼Ÿ** +Aļ¼šé»˜č®¤äøäæē•™ć€‚`call_agent` ęÆę¬”č°ƒē”Øä½æē”Øē‹¬ē«‹ēš„ `Runner + InMemorySessionService`ć€‚éœ€č¦ēœŸå®žå¤šč½®äøŠäø‹ę–‡ę—¶ļ¼Œéœ€åœØ `call_agent` å†…éƒØč‡Ŗč”Œē»“ęŠ¤ 
session ēŠ¶ę€ā€”ā€”å…øåž‹åšę³•ę˜Æē”Ø `contextvars.ContextVar` å­˜ę”¾å½“å‰ case ēš„ history(`asyncio.Task` åÆåŠØę—¶č‡ŖåŠØ `copy_context`ļ¼Œå¹¶å‘čÆ„ä¼°äø‹å¤©ē„¶ęŒ‰ task éš”ē¦»ļ¼‰ć€‚ + +**Q:reflection LM äøŽ agent å…±ē”ØęØ”åž‹ę˜Æå¦ä¼šå¼•å…„"自评"åå·®ļ¼Ÿ** +A:`llm_rubric_response` 让 judge ä¾ę®é¢„å®šä¹‰ rubric ę–‡ęœ¬ę‰“åˆ†ļ¼Œč€Œéžč‡Ŗē”±čÆ„ä»·ļ¼ŒåÆē¼“č§£å¤§éƒØåˆ†åå·®ć€‚ē”Ÿäŗ§ēŽÆå¢ƒå»ŗč®® judge é…ē½®ē‹¬ē«‹ęØ”åž‹ļ¼Œå‚č§ §5.1怂 + +**Q:`best_prompts/` äø­ēš„ę–‡ä»¶å°±ę˜Æęœ€ē»ˆäŗ§ē‰©å—ļ¼Ÿ** +A:是 val é›†äøŠå¾—åˆ†ęœ€é«˜ēš„å€™é€‰ć€‚`update_source=False` ę—¶ęŗę–‡ä»¶äøå˜ļ¼Œéœ€ę‰‹åŠØå¤åˆ¶ļ¼›`update_source=True` ę—¶ęŗę–‡ä»¶č¢«č‡ŖåŠØč¦†ē›–ļ¼ˆä»…åœØ `OptimizeResult.status=SUCCEEDED` ę—¶č§¦å‘ļ¼‰ć€‚ + +**Qļ¼ščæč”Œē»“ęŸåŽ baseline äøŽ best åˆ†ę•°ę— å˜åŒ–ļ¼ˆę— ę”¶ę•›ļ¼‰čÆ„å¦‚ä½•ęŽ’ęŸ„ļ¼Ÿ** +Aļ¼šęŒ‰ä»„äø‹é”ŗåŗę£€ęŸ„ļ¼š +1. baseline prompt ę˜Æå¦čæ‡äŗŽē®€å•ļ¼ŒåÆ¼č‡“ reflection LM ę— ę˜Žē”®ę”¹čæ›ę–¹å‘ +2. `reflection_minibatch_size` ę˜Æå¦čæ‡å°ļ¼ŒåÆ¼č‡“åę€ē“ ęå•č°ƒ +3. metric é˜ˆå€¼ę˜Æå¦č®¾ē½®čæ‡é«˜ļ¼ˆå¦‚ 1.0 要求全 case å®Œē¾ŽåŒ¹é…ļ¼‰ +4. ē›“ęŽ„é˜…čÆ» `runs//rounds/round_*.json` äø­ēš„ reflection LM åŽŸå§‹č¾“å‡ŗļ¼ŒåøøåÆå®šä½å…·ä½“åŽŸå›  + +**Qļ¼šå•ę¬”čæč”Œēš„å¼€é”€ä¼°ē®—ļ¼Ÿ** +A:本 example é»˜č®¤é…ē½®äø‹ēŗ¦ 5 åˆ†é’Ÿć€ēŗ¦ 60 ꬔ LLM č°ƒē”Øć€‚`max_metric_calls=60` ę˜Æē”¬ę€§äøŠé™ļ¼Œč¶…å‡ŗē«‹å³åœę­¢ć€‚ + +## 8 Ā· ęŽ„å…„č‡Ŗęœ‰ agent ēš„ę­„éŖ¤ + +1. ę›æę¢ `agent/prompts/*.md` 为业劔 baseline prompt +2. 修改 `agent/agent.py` äø­ `create_agent()` å®žēŽ°ļ¼ŒåÆ¹ęŽ„äøšåŠ”ęØ”åž‹ / tools / output schema +3. ę›æę¢ `train.evalset.json` äøŽ `val.evalset.json` äøŗäøšåŠ”ē”Øä¾‹ +4. č°ƒę•“ `optimizer.json` äø­ metric ē±»åž‹äøŽé˜ˆå€¼ +5. 
运蔌 `run_optimization.py`ļ¼Œę ¹ę® `summary.txt` äøŽ `result.json` å†³å®šę˜Æå¦ē»§ē»­č°ƒå‚ + +č‹„äøšåŠ” agent ēš„å½¢ę€äøåŒäŗŽęœ¬ example(HTTP ęœåŠ”ć€čæœē«Æ prompt ęŗć€å¤š agent ē¼–ęŽ’ć€CLI é»‘ē›’ē­‰ļ¼‰ļ¼ŒčÆ·å‚č€ƒ `examples/optimization/` äø‹åÆ¹åŗ”äø“é¢˜ē¤ŗä¾‹ć€‚ diff --git a/examples/optimization/quickstart/agent/__init__.py b/examples/optimization/quickstart/agent/__init__.py new file mode 100644 index 0000000..bc6e483 --- /dev/null +++ b/examples/optimization/quickstart/agent/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/optimization/quickstart/agent/agent.py b/examples/optimization/quickstart/agent/agent.py new file mode 100644 index 0000000..f30bd8a --- /dev/null +++ b/examples/optimization/quickstart/agent/agent.py @@ -0,0 +1,103 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ę±‚č§£ agent —— Quickstart 专用。 + +é€‚ē”Øåœŗę™Æ +-------- +ęœ¬ę–‡ä»¶ę˜Æ Quickstart example ēš„ agent å®žēŽ°ć€‚å®ƒę¼”ē¤ŗäø€äøŖč¢« GEPA ä¼˜åŒ–ēš„ agent +ęœ€åøøč§ēš„å†™ę³•ļ¼šē”Øäø€äøŖå·„åŽ‚å‡½ę•° create_agent()ļ¼ŒęÆę¬”č°ƒē”Øéƒ½ä»Žē£ē›˜é‡čÆ» prompt +ę–‡ä»¶å†ęž„å»ŗ LlmAgent,让 GEPA å†™å…„ēš„ę–°å€™é€‰ē«‹å³ē”Ÿę•ˆć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +- 暓露 SYSTEM_PROMPT_PATH / SKILL_PATH(被 run_optimization.py ę³Øå†Œåˆ° TargetPrompt) +- ęä¾› create_agent() å·„åŽ‚å‡½ę•°ļ¼ˆč¢« call_agent åœØęÆę¬”ęŽØē†ę—¶č°ƒē”Øļ¼‰ + +äøŗä»€ä¹ˆ prompt ę‹†ęˆäø¤äøŖę–‡ä»¶ +-------------------------- +äø¤äøŖę–‡ä»¶ę‰®ę¼”äøåŒč§’č‰²ļ¼ŒåŒę—¶č¢« GEPA ä¼˜åŒ–ļ¼š + + system.md (key="system_prompt") + 定义 agent ēš„č§’č‰²å®šä½å’Œč¾“å‡ŗę ¼å¼ēŗ¦ęŸć€‚ + baseline ę•…ę„å†™"åŖč¾“å‡ŗęœ€ē»ˆē­”ę”ˆ"ā€”ā€”äøŽ skill.md ēš„"å±•å¼€ę€č·Æ"冲突。 + + skill.md (key="skill") + ęčæ°č§£é¢˜ę–¹ę³•č®ŗļ¼Œč¦ę±‚ agent å±•å¼€ęŽØē†čæ‡ēØ‹ć€‚ + +å†²ēŖę˜Æåˆ»ę„č®¾č®”ļ¼šč®© GEPA åæ…é”»čÆ†åˆ«ēŸ›ē›¾ć€ę”¹å†™å…¶äø­č‡³å°‘äø€äøŖę–‡ä»¶ļ¼Œę‰čƒ½č®©äø¤ę” +metric åŒę—¶é€ščæ‡ć€‚čæ™ę ·čƒ½ē›“č§‚ēœ‹åˆ°åę€ęœŗåˆ¶ēš„ä»·å€¼ć€‚ + +äø¤äøŖę–‡ä»¶ęŒ‰ä»„äø‹ę ¼å¼ę‹¼åˆļ¼š + {system.md 内容}\n\n## č§£é¢˜ę–¹ę³•\n{skill.md 内容} + +äøŗä»€ä¹ˆęÆę¬”éƒ½é‡ę–°ęž„å»ŗ agentļ¼Œäøå¤ē”Øå®žä¾‹ +-------------------------------------- +1. GEPA åœØč½®ę¬”ä¹‹é—“ä¼šäæ®ę”¹ prompt ę–‡ä»¶ļ¼›å¤ē”Øå®žä¾‹ä¼šē”Øåˆ°ę—§ prompt +2. 并发 case čÆ„ęµ‹ę—¶ęÆę¬”ē‹¬ē«‹ęž„å»ŗę›“å®‰å…Øļ¼Œę— å…±äŗ«ēŠ¶ę€ +3. 
LlmAgent ęž„å»ŗęœ¬čŗ«å¾ˆč½»ļ¼ˆäøę¶‰åŠ LLM č°ƒē”Øļ¼‰ļ¼Œå¼€é”€åÆåæ½ē•„ +""" + +from pathlib import Path + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel, OpenAIModel +from trpc_agent_sdk.types import GenerateContentConfig + +from .config import get_model_config + + +# 两个 prompt ę–‡ä»¶ēš„ē»åÆ¹č·Æå¾„ļ¼ˆrun_optimization.py ęŠŠå®ƒä»¬ę³Øå†Œęˆ TargetPrompt) +SYSTEM_PROMPT_PATH = Path(__file__).parent / "prompts" / "system.md" +SKILL_PATH = Path(__file__).parent / "prompts" / "skill.md" + + +def _create_model() -> LLMModel: + """ęž„å»ŗ OpenAI å…¼å®¹ēš„ chat ęØ”åž‹å®žä¾‹ć€‚ + + å‡­ę®ä»ŽēŽÆå¢ƒå˜é‡čÆ»å–ļ¼ˆč§ config.pyļ¼‰ļ¼Œē¼ŗä»»ä½•äø€äøŖéƒ½ä¼š fail-fast怂 + """ + api_key, base_url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url) + + +def _read_instruction() -> str: + """ä»Žäø¤äøŖ prompt ę–‡ä»¶ę‹¼åˆå®Œę•“ instruction怂 + + ęÆę¬”č°ƒē”Øéƒ½é‡čÆ»ē£ē›˜ļ¼Œē”®äæ GEPA å†™å…„ēš„ę–°å€™é€‰ē«‹å³ē”Ÿę•ˆļ¼›åˆ†éš”ē¬¦ "## č§£é¢˜ę–¹ę³•" + č®©ę‹¼åˆåŽēš„ę–‡ęœ¬ä»äæęŒäø¤å—å†…å®¹ēš„č¾¹ē•Œļ¼Œä¾æäŗŽäŗŗē±»å’Œ reflection_lm é˜…čÆ»ć€‚ + """ + system = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + skill = SKILL_PATH.read_text(encoding="utf-8").strip() + return f"{system}\n\n## č§£é¢˜ę–¹ę³•\n{skill}" + + +def _create_agent_with_prompts(instruction: str) -> LlmAgent: + """LlmAgent ęž„å»ŗå…¬å…±é€»č¾‘ā€”ā€”ē»™å®š instructionļ¼Œčæ”å›ž agent å®žä¾‹ć€‚ + + 把"读 prompt"和"ęž„å»ŗ agent"åˆ†å¼€ļ¼Œę–¹ä¾æęµ‹čÆ•ę—¶ē›“ęŽ„ä¼ å…„å­—ē¬¦äø²č€Œäøåæ…ä¾čµ–ē£ē›˜ć€‚ + """ + return LlmAgent( + name="math_word_problem_agent", + description=( + "å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ę±‚č§£ agent怂system prompt äøŽ skill prompt ē”± GEPA " + "åę€ęœŗåˆ¶č”åˆä¼˜åŒ–ć€‚" + ), + model=_create_model(), + instruction=instruction, + generate_content_config=GenerateContentConfig( + temperature=0.2, + top_p=0.9, + max_output_tokens=2048, + ), + ) + + +def create_agent() -> LlmAgent: + """ęž„å»ŗäø€äøŖä½æē”Øå½“å‰ē£ē›˜ 
prompt ēš„ę–° LlmAgent å®žä¾‹ć€‚ + + call_agent åœØęÆę¬”ęŽØē†ę—¶č°ƒē”Øę­¤å‡½ę•°ć€‚ + """ + return _create_agent_with_prompts(_read_instruction()) diff --git a/examples/optimization/quickstart/agent/config.py b/examples/optimization/quickstart/agent/config.py new file mode 100644 index 0000000..bdc79fd --- /dev/null +++ b/examples/optimization/quickstart/agent/config.py @@ -0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""ęØ”åž‹å‡­ę®čÆ»å– —— ä»ŽēŽÆå¢ƒå˜é‡åŠ č½½ OpenAI 兼容 LLM ēš„čæžęŽ„äæ”ęÆć€‚ + +éœ€č¦ēš„ēŽÆå¢ƒå˜é‡ +-------------- + TRPC_AGENT_API_KEY LLM åŽē«Æēš„ API key + TRPC_AGENT_BASE_URL LLM åŽē«Æēš„ endpointļ¼Œä¾‹å¦‚ https://api.example.com/v1 + TRPC_AGENT_MODEL_NAME ęØ”åž‹åļ¼Œä¾‹å¦‚ glm-5.1 / gpt-4o-mini + +äøŗä»€ä¹ˆäø‰äøŖéƒ½ē¼ŗäø€äøåÆ +-------------------- +ē¼ŗä»»ę„äø€äøŖå°±ē«‹å³ęŠ› ValueErrorļ¼Œéæå…čæč”Œåˆ°äø€åŠę‰ę’žåˆ° LLM åŽē«Æēš„ 401 é”™čÆÆļ¼Œ +é‚£ę—¶ęŠ„é”™äæ”ęÆä¼šå¾ˆęœ‰čæ·ęƒ‘ę€§ļ¼ˆēœ‹čµ·ę„åƒ prompt å†™é”™äŗ†ļ¼Œå®žé™…ę˜Æå‡­ę®ę²”é…ļ¼‰ć€‚ +""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """čæ”å›ž (api_key, base_url, model_name)ļ¼›ä»»äø€ē¼ŗå¤±ē«‹åˆ»ęŠ„é”™ć€‚""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "čæč”Œä¼˜åŒ–å™Øå‰åæ…é”»é…ē½®ēŽÆå¢ƒå˜é‡ TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME怂" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/quickstart/agent/prompts/skill.md b/examples/optimization/quickstart/agent/prompts/skill.md new file mode 100644 index 0000000..7bcd691 --- /dev/null +++ b/examples/optimization/quickstart/agent/prompts/skill.md @@ -0,0 +1 @@ +å½“é‡åˆ°é—®é¢˜ę—¶ļ¼ŒčÆ·å±•å¼€č®²č®²ä½ 
ēš„ę€č·Æļ¼ŒåÆä»„å¤šå†™å‡ ę®µļ¼Œč®©ē”Øęˆ·ēœ‹åˆ°å®Œę•“ēš„ę€č€ƒčæ‡ēØ‹ć€‚ diff --git a/examples/optimization/quickstart/agent/prompts/system.md b/examples/optimization/quickstart/agent/prompts/system.md new file mode 100644 index 0000000..70314dc --- /dev/null +++ b/examples/optimization/quickstart/agent/prompts/system.md @@ -0,0 +1 @@ +ä½ ę˜Æäø€äøŖå‹å„½ēš„čŠå¤©åŠ©ę‰‹ļ¼Œå–œę¬¢å’Œē”Øęˆ·åˆ†äŗ«ęƒ³ę³•ć€‚å›žē­”ē”Øęˆ·é—®é¢˜ę—¶ļ¼ŒčÆ·å°½é‡ē”Øē”ŸåŠØć€åÆŒęœ‰äŗŗęƒ…å‘³ēš„čÆ­čØ€ļ¼Œč®©ē”Øęˆ·ę„Ÿč§‰åƒę˜ÆåœØå’Œęœ‹å‹čŠå¤©ć€‚ diff --git a/examples/optimization/quickstart/optimizer.json b/examples/optimization/quickstart/optimizer.json new file mode 100644 index 0000000..bc274eb --- /dev/null +++ b/examples/optimization/quickstart/optimizer.json @@ -0,0 +1,88 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "contains", + "case_insensitive": true + } + } + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "num_samples": 1, + "generation_config": { + "max_tokens": 1024, + "temperature": 0.2 + } + }, + "rubrics": [ + { + "id": "numeric_correct", + "content": { + "text": "ęœ€ē»ˆē»™å‡ŗēš„ę•°å­—ē­”ę”ˆę˜Æå¦äøŽå‚č€ƒē­”ę”ˆäø€č‡“ļ¼ˆę•°å€¼ē›øåŒå³č§†äøŗé€ščæ‡ļ¼‰ć€‚" + }, + "description": "ę•°å­—ē­”ę”ˆäøŽå‚č€ƒē­”ę”ˆäø€č‡“", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "reasoning_clear", + "content": { + "text": "å›žē­”äø­ę˜Æå¦ē»™å‡ŗęø…ę™°ć€åÆčæ½ęŗÆēš„ęŽØē†ęˆ–č®”ē®—ę­„éŖ¤ļ¼Œč®©čÆ»č€…čƒ½å¤ę øē»“ęžœć€‚" + }, + "description": "ęŽØē†ę­„éŖ¤ęø…ę™°", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "units_present", + "content": { + "text": "ęœ€ē»ˆę•°å­—ē­”ę”ˆę˜Æå¦åø¦ęœ‰ę­£ē”®ēš„å•ä½ļ¼ˆä¾‹å¦‚ļ¼šäøŖć€å…ƒć€å…¬é‡Œć€åƒå…‹ć€äŗŗ 等)。" + }, + "description": 
"ē­”ę”ˆåø¦ę­£ē”®å•ä½", + "type": "FINAL_RESPONSE_QUALITY" + } + ] + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, + "temperature": 0.6 + } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 60, + "max_iterations_without_improvement": 8 + } + } +} diff --git a/examples/optimization/quickstart/run_optimization.py b/examples/optimization/quickstart/run_optimization.py new file mode 100644 index 0000000..1111c69 --- /dev/null +++ b/examples/optimization/quickstart/run_optimization.py @@ -0,0 +1,167 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Quickstart å…„å£č„šęœ¬ļ¼šę¼”ē¤ŗē”Ø GEPA åŒę—¶ä¼˜åŒ–äø¤äøŖ prompt ꖇ件怂 + +é€‚ē”Øåœŗę™Æ +-------- +ä½ ęƒ³č·‘é€š prompt č‡ŖåŠØä¼˜åŒ–ēš„ęœ€å°å®Œę•“ęµēØ‹ļ¼šč®©äø€äøŖåę€ LLM ēœ‹ä½  agent ēš„å¤±č“„ē”Øä¾‹ļ¼Œ +č‡ŖåŠØę”¹å†™ prompt ē›“åˆ°é€ščæ‡ēŽ‡č¾¾ę ‡ć€‚ęœ¬č„šęœ¬ę˜Æ 10 äøŖ example ēš„å…„é—Øę¬¾ć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +1. ę³Øå†Œäø¤äøŖ prompt ę–‡ä»¶ä½œäøŗä¼˜åŒ–ē›®ę ‡ļ¼ˆsystem.md + skill.md) +2. 定义 call_agent å›žč°ƒļ¼ˆę”†ęž¶é€ščæ‡å®ƒé©±åŠØ agent) +3. 
调 AgentOptimizer.optimize 开跑 + +ę€Žä¹ˆč·‘ +------ +1) é…äø‰äøŖēŽÆå¢ƒå˜é‡ļ¼šTRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +2) python examples/optimization/quickstart/run_optimization.py +3) ēœ‹ runs/<ę—¶é—“ęˆ³>/ äø‹ēš„ summary.txt 和 best_prompts/ + +ęŽ„å…„č‡Ŗå·±äøšåŠ”ę—¶ę”¹å“Ŗé‡Œ +-------------------- +- target : ę”¹ęˆä½ č‡Ŗå·±ēš„ prompt ę–‡ä»¶č·Æå¾„ļ¼ˆmain 函数内) +- call_agent : ę›æę¢å®žēŽ°ļ¼Œč®©å®ƒč°ƒä½ ēš„ agent(HTTP / 多 agent 链路 / + 远端 prompt 等其他形态见对应 example) +- update_source=False : ęƒ³č·‘å®Œē›“ęŽ„č¦†ē›–ęŗę–‡ä»¶ę”¹ Trueļ¼ˆå…øåž‹ CI åœŗę™Æļ¼‰ +- verbose : 0 静默 / 1 čæ›åŗ¦é¢ęæ / 2 加 gepa å†…éƒØę—„åæ— +- CONFIG_PATH : ē®—ę³•å’Œ metric é…ē½®éƒ½åœØ optimizer.json +""" + +from __future__ import annotations + +import asyncio +import sys +import uuid +from datetime import datetime +from pathlib import Path + + +# ---- č·Æå¾„č‡Ŗäø¾ļ¼šč®©č„šęœ¬åœØä»»ę„ cwd äø‹éƒ½čƒ½čæč”Œ ---- +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content, Part + +from agent.agent import SKILL_PATH, SYSTEM_PROMPT_PATH, create_agent + + +# ---- é…ē½®äøŽę•°ę®č·Æå¾„ ---- +CONFIG_PATH = _HERE / "optimizer.json" # 算法 + metric é…ē½® +TRAIN_PATH = _HERE / "train.evalset.json" # åę€ę—¶ēš„ minibatch ę„ęŗļ¼ˆ5 ę”ē®—ęœÆé¢˜ļ¼‰ +VAL_PATH = _HERE / "val.evalset.json" # ęÆč½®å…Øé‡čÆ„ä¼°ļ¼Œå†³å®šę˜Æå¦ęŽ„å—å€™é€‰ +RUNS_DIR = _HERE / "runs" # ęÆę¬”čæč”Œå†™åˆ°ē‹¬ē«‹ę—¶é—“ęˆ³å­ē›®å½• +APP_NAME = "math_word_problem_optimizer" # Runner / SessionService ēš„å‘½åē©ŗé—“ + + +async def call_agent(query: str) -> str: + """ę”†ęž¶å›žč°ƒļ¼šē”Øå½“å‰å€™é€‰ prompt 驱动 agent äø€ę¬”ļ¼Œčæ”å›žęœ€ē»ˆå›žē­”ę–‡ęœ¬ć€‚ + + 
ę”†ęž¶åœØä»„äø‹ę—¶ęœŗä¼šč°ƒē”Øęœ¬å‡½ę•°ļ¼š + - baseline čÆ„ä¼°ļ¼šęÆę” val case Ɨ num_runs ꬔ + - ęÆč½®åę€ļ¼šęÆę” minibatch case 评测一欔 + - ęÆč½®éŖŒčÆļ¼šęÆę” val case Ɨ num_runs ꬔ + + å®žēŽ°č¦ē‚¹ + -------- + 1. ęÆę¬”č°ƒē”Øéƒ½ä»Žē£ē›˜é‡čÆ» prompt → GEPA å†™å…„ę–°å€™é€‰åŽē«‹å³ē”Ÿę•ˆļ¼Œę— éœ€é‡åÆčæ›ēØ‹ + 2. ęÆę¬”č°ƒē”Øē‹¬ē«‹åˆ›å»ŗ Runner + InMemorySessionService → ęÆäøŖ case ę‹æåˆ°å…Øę–° + session stateļ¼Œå¹¶å‘čÆ„ęµ‹ę—¶äøäŗ’ē›øę±”ęŸ“ļ¼ˆčÆ„ä¼°éš”ē¦»ēš„ē”¬ę€§č¦ę±‚ļ¼‰ + 3. åŖę”¶é›† is_final_response() äŗ‹ä»¶äø­éž thought ēš„ę–‡ęœ¬ → čæ‡ę»¤ęŽ‰ thinking + tokenļ¼ŒåŖčæ”å›žę­£å¼å›žē­” + + å‚ę•° + ---- + query: ē”Øęˆ·č¾“å…„ę–‡ęœ¬ļ¼ˆę„č‡Ŗ evalset ēš„ conversation[*].user_content) + + čæ”å›ž + ---- + agent ęœ€ē»ˆå›žē­”ēš„ēŗÆę–‡ęœ¬ļ¼ˆå·² strip) + """ + # ęÆę¬”č°ƒē”Øé‡čÆ» prompt ę–‡ä»¶ļ¼ˆåœØ create_agent() å†…éƒØå®Œęˆļ¼‰ + root_agent = create_agent() + + # ęÆäøŖ case äø€ä»½ē‹¬ē«‹ēš„ session ęœåŠ”ļ¼ŒäæčÆå¹¶å‘čÆ„ęµ‹ę—¶äøä¼šé€ščæ‡ session + # state äŗ’ē›øę±”ęŸ“čÆ„åˆ†ć€‚ + session_service = InMemorySessionService() + runner = Runner( + app_name=APP_NAME, + agent=root_agent, + session_service=session_service, + ) + + session_id = str(uuid.uuid4()) + user_id = "optimizer" + await session_service.create_session( + app_name=APP_NAME, + user_id=user_id, + session_id=session_id, + state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=query)]) + + # ę”¶é›†ęœ€ē»ˆå›žē­”ļ¼›čæ‡ę»¤ęŽ‰ thinking tokenļ¼ˆå¦‚ęžœęØ”åž‹åÆē”Øäŗ† think ęØ”å¼ļ¼‰ + final_text = "" + async for event in runner.run_async( + user_id=user_id, + session_id=session_id, + new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: # 跳过 thinkingļ¼ŒåŖäæē•™ę­£å¼å›žē­” + continue + if part.text: + final_text += part.text + return final_text.strip() + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimize怂""" + + # 
ę³Øå†Œäø¤äøŖä¼˜åŒ–ē›®ę ‡ę–‡ä»¶ć€‚ + # GEPA ēš„ round_robin module_selector ä¼šęÆč½®äŗ¤ę›æé€‰å…¶äø­äø€äøŖę”¹å†™ā€”ā€” + # å•č½®åŖę”¹äø€äøŖę–‡ä»¶čƒ½č®©åę€ LM ę›“čšē„¦ļ¼Œä¹Ÿå®¹ę˜“å½’å› "ę˜Æå“ŖäøŖę–‡ä»¶ęå‡äŗ†ę•ˆęžœ"怂 + target = ( + TargetPrompt() + .add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + .add_path("skill", str(SKILL_PATH)) + ) + + # ęÆę¬”čæč”Œč½åˆ°ē‹¬ē«‹ē›®å½•ļ¼Œé‡å¤čæč”Œäøč¦†ē›–åŽ†å²ē»“ęžœ + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / timestamp + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + # update_source=False:源 prompt ę–‡ä»¶äæęŒäøå˜ļ¼Œęœ€ä¼˜å€™é€‰åŖå†™åˆ° + # output_dir/best_prompts/。改 True åˆ™åœØ SUCCEEDED åŽč¦†ē›–ęŗę–‡ä»¶ļ¼Œ + # é€‚ē”ØäŗŽ"č·‘å®Œē›“ęŽ„ē”Ø"ēš„ CI åœŗę™Æļ¼ˆå‚č€ƒ ci_integration/ example)。 + update_source=False, + # verbose: 0 é™é»˜ļ¼›1 Rich čæ›åŗ¦é¢ęæļ¼›2 附带 gepa å†…éƒØčÆŠę–­ę—„åæ— + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/quickstart/train.evalset.json b/examples/optimization/quickstart/train.evalset.json new file mode 100644 index 0000000..1d5546a --- /dev/null +++ b/examples/optimization/quickstart/train.evalset.json @@ -0,0 +1,112 @@ +{ + "eval_set_id": "math_word_problems_train", + "name": "å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ - č®­ē»ƒé›†", + "description": "5 é“å°å­¦ę°“å¹³ē®—ęœÆåŗ”ē”Øé¢˜ļ¼Œč¦†ē›–åŠ ę³•ć€ä¹˜ę³•ć€ęŠ˜ę‰£ć€ē™¾åˆ†ęÆ”äøŽć€Œäŗŗ/åć€ē±»å•ä½ļ¼›final_response äø­åø¦ć€Œē­”ę”ˆļ¼šxxxć€ēš„ę ‡å‡†ē­”å„ä¾› contains åŒ¹é…ļ¼ŒåŒę—¶ä½œäøŗ LLM č£åˆ¤ēš„å‚č€ƒē­”ę”ˆć€‚", + "eval_cases": [ + { + "eval_id": "wp_apples_add", + "conversation": [ + { + "invocation_id": "t1", + "user_content": { + "parts": [{"text": "å°ę˜Žę—©äøŠä¹°äŗ† 4 äøŖč‹¹ęžœļ¼Œäø‹åˆåˆä¹°äŗ† 7 äøŖč‹¹ęžœļ¼Œä»–äø€å…±ęœ‰å¤šå°‘äøŖč‹¹ęžœļ¼Ÿ"}], + "role": "user" + }, + 
"final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š11 äøŖ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_car_distance", + "conversation": [ + { + "invocation_id": "t2", + "user_content": { + "parts": [{"text": "äø€č¾†ę±½č½¦ä»„ęÆå°ę—¶ 60 å…¬é‡Œēš„é€Ÿåŗ¦č”Œé©¶ 2.5 å°ę—¶ļ¼Œäø€å…±č”Œé©¶äŗ†å¤šå°‘å…¬é‡Œļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š150 公里"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_discount_price", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [{"text": "äø€ä»¶č”£ęœåŽŸä»· 200 å…ƒļ¼ŒēŽ°åœØę‰“ 8 ęŠ˜å‡ŗå”®ļ¼ŒęŠ˜åŽä»·ę˜Æå¤šå°‘å…ƒļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š160 元"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_glasses_percent", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [{"text": "ē­é‡Œäø€å…±ęœ‰ 40 åå­¦ē”Ÿļ¼Œå…¶äø­ 25% ęˆ“ēœ¼é•œļ¼Œęˆ“ēœ¼é•œēš„ęœ‰å¤šå°‘äŗŗļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š10 äŗŗ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_defect_items_percent", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "一批商品共 50 ä»¶ļ¼Œå…¶äø­ 30% ę˜Æę¬”å“ļ¼Œę¬”å“ęœ‰å¤šå°‘ä»¶ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š15 ä»¶"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "trainer", + "state": {} + } + } + ] +} diff --git 
a/examples/optimization/quickstart/val.evalset.json b/examples/optimization/quickstart/val.evalset.json new file mode 100644 index 0000000..9408bb1 --- /dev/null +++ b/examples/optimization/quickstart/val.evalset.json @@ -0,0 +1,70 @@ +{ + "eval_set_id": "math_word_problems_val", + "name": "å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ - éŖŒčÆé›†", + "description": "3 é“å°å­¦ę°“å¹³ē®—ęœÆåŗ”ē”Øé¢˜ļ¼Œč¦†ē›–ä¹˜ę³•ć€å•ä½ę¢ē®—å’Œē™¾åˆ†ęÆ”ļ¼›final_response äø­åø¦ć€Œē­”ę”ˆļ¼šxxxć€ēš„ę ‡å‡†ē­”å„ä¾› contains åŒ¹é…ļ¼ŒåŒę—¶ä½œäøŗ LLM č£åˆ¤ēš„å‚č€ƒē­”ę”ˆć€‚", + "eval_cases": [ + { + "eval_id": "wp_seats_multiply", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [{"text": "ę•™å®¤é‡Œęœ‰ 5 ęŽ’åŗ§ä½ļ¼ŒęÆęŽ’ 8 äøŖļ¼Œäø€å…±å¤šå°‘äøŖåŗ§ä½ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š40 äøŖ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_water_weight", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [{"text": "已矄 1 å‡ę°“é‡ 1 åƒå…‹ļ¼Œ3.5 å‡ę°“é‡å¤šå°‘åƒå…‹ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š3.5 千克"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_class_girls_percent", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "ē­é‡Œäø€å…±ęœ‰ 30 äŗŗļ¼Œå…¶äø­ 60% ę˜Æå„³ē”Ÿļ¼ŒčÆ·é—®ęœ‰å¤šå°‘åå„³ē”Ÿļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š18 äŗŗ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "math_word_problem_optimizer", + "user_id": "validator", + "state": {} + } + } + ] +} diff --git a/examples/optimization/remote_prompt_store/README.md b/examples/optimization/remote_prompt_store/README.md new file mode 
100644 index 0000000..5ae37fb --- /dev/null +++ b/examples/optimization/remote_prompt_store/README.md @@ -0,0 +1,208 @@ +# Remote Prompt Store — ęŽ„å…„čæœē«Æé…ē½®äø­åæƒåš prompt 优化 + +> **é€‚ē”Øåœŗę™Æ**:业劔 prompt äøåœØęœ¬åœ°ę–‡ä»¶ļ¼Œč€Œē”± ops é…åœØčæœē«Æé…ē½®äø­åæƒļ¼ˆäøƒå½©ēŸ³ / Apollo / Nacos / 自研 KV / ę•°ę®åŗ“ļ¼‰ļ¼ŒäøšåŠ”ęœåŠ”ä»Žäø­åæƒę‹‰å–ä½æē”Øć€‚ęœ¬ example ę¼”ē¤ŗé€ščæ‡ `TargetPrompt.add_callback` å°†ä¼˜åŒ–å™ØåÆ¹ęŽ„čæœē«ÆčÆ»å†™ęŽ„å£ļ¼Œå¹¶é€ščæ‡ production / sandbox 双 namespace éš”ē¦»ē”Ÿäŗ§ę•°ę®ć€‚é˜…čÆ»å‰čÆ·å…ˆē†Ÿę‚‰ `quickstart/README.md` äøŽ `http_service/README.md`怂 + +## 1 Ā· é€‚ē”Øé—®é¢˜äøŽč®¾č®”ē›®ę ‡ + +远端 prompt åœŗę™ÆäøŽęœ¬åœ°ę–‡ä»¶åœŗę™Æēš„å…³é”®å·®å¼‚ļ¼š + +- ä¼˜åŒ–å™Øę— ę³•ē›“ęŽ„čÆ»å†™ęœ¬åœ°ę–‡ä»¶ā€”ā€”åæ…é”»é€ščæ‡ē”Øęˆ·ęä¾›ēš„ async å‡½ę•°ę“ä½œčæœē«Æ +- ē”Ÿäŗ§ prompt é€šåøøę‰æę‹…ēŗæäøŠęµé‡ļ¼ŒęœŖē»å®”ę‰¹ēš„å†™å…„ę„å‘³ē€åˆč§„é£Žé™© +- äøåŒēŽÆå¢ƒļ¼ˆē”Ÿäŗ§ / 沙箱 / ē°åŗ¦ļ¼‰ēš„ prompt é€šåøøå·²ē»å­˜åœØ namespace éš”ē¦»ęœŗåˆ¶ + +本 example ēš„č®¾č®”åŽŸåˆ™ļ¼š + +- **ä¼˜åŒ–å™ØåŖčÆ»å†™ę²™ē®± namespace**ļ¼Œē”Ÿäŗ§ namespace å…ØēØ‹äøč¢«č§¦ē¢° +- **`update_source=False` å¼ŗåˆ¶ēŗ¦ęŸ**ļ¼šč·‘å®ŒåŽę²™ē®±č‡ŖåŠØå›žę»šåˆ° baselineļ¼Œå€™é€‰åŖč¾“å‡ŗåˆ°ęœ¬åœ° `runs//best_prompts/`ļ¼Œē”±äŗŗå·„å®”ę‰¹åŽå¦č”ŒåŒę­„åˆ°ē”Ÿäŗ§ +- **é…ē½®äø­åæƒå®žēŽ°é€ę˜Ž**ļ¼šē”Øęˆ·ęä¾›äø¤äøŖ async å‡½ę•°ļ¼ˆ`read` / `write`ļ¼‰ļ¼Œä¼˜åŒ–å™ØåÆ¹ KV åŽē«Æå½¢ę€å®Œå…Øé»‘ē›’ + +| 输兄 | 输出 | +| --- | --- | +| 一对 async å‡½ę•°ļ¼š`async read() -> str` äøŽ `async write(value: str) -> None` | 沙箱 namespace äø­ēš„ęœ€ä¼˜ prompt å€™é€‰å‰Æęœ¬ļ¼ˆruns/best_prompts/) | +| 沙箱 namespace ēš„å†™å…„ęƒé™ | ē”Ÿäŗ§ namespace äøå˜ļ¼›ę²™ē®±åœØę”¶å°¾ę—¶č‡ŖåŠØå›žę»šåˆ° baseline | + +### 本 example ę¼”ē¤ŗēš„ęœ€å°ē”Øä¾‹ + +| 结度 | 值 | +| --- | --- | +| 业劔任劔 | ē®—ęœÆåŗ”ē”Øé¢˜ę±‚č§£ļ¼ˆäøŽ quickstart åŒäø€ē±»ä»»åŠ”ļ¼‰ | +| 远端 KV ęØ”ę‹Ÿ | `store/fake_kv_store.py` ē”Øęœ¬åœ° JSON ę–‡ä»¶ęŒä¹…åŒ–ēš„å­—å…ø | +| ä¼˜åŒ–ē›®ę ‡ | `system_prompt` å­—ę®µļ¼Œå­˜å‚ØäŗŽ `system_prompt:sandbox` 这个 KV key | +| 
éŖŒčÆęŒ‡ę ‡ | `final_response_avg_score`(contains åŒ¹é…ļ¼‰ | +| 训练 / éŖŒčÆč§„ęØ” | 5 ę” / 3 ę” | + +## 2 Ā· ęœÆčÆ­åÆ¹ē…§ + +ä»…åˆ—å‡ŗęœ¬ example å¼•å…„ēš„ę–°ę¦‚åæµć€‚åŸŗē”€ęœÆčÆ­č§ `quickstart/README.md` §2,`call_agent` async čµ„ęŗēŗ¦ęŸč§ `http_service/README.md` §2怂 + +| 术语 | 含义 | +| --- | --- | +| **TargetPrompt.add_callback(name, read=, write=)** | ę³Øå†Œäø€äøŖē”±ē”Øęˆ·å‡½ę•°é©±åŠØēš„ prompt 字段。`read` / `write` åæ…é”»ę˜Æ async 函数;`read` ę— å‚ę•°čæ”å›ž prompt ę–‡ęœ¬ļ¼Œ`write` ęŽ„ę”¶ę–°ę–‡ęœ¬å¹¶å†™å…„ć€‚ä¼˜åŒ–å™ØåœØčÆ„ęµ‹å‰č°ƒ `read`ć€äŗ§ē”Ÿę–°å€™é€‰ę—¶č°ƒ `write`怂 | +| **ē”Ÿäŗ§ / 沙箱 namespace** | é…ē½®äø­åæƒåøøč§ēš„ēŽÆå¢ƒéš”ē¦»å½¢ę€ć€‚ęœ¬ example ē”Øäø¤äøŖå›ŗå®š KV key ęØ”ę‹Ÿļ¼š`system_prompt:production`ļ¼ˆēŗæäøŠčÆ»å–ļ¼‰äøŽ `system_prompt:sandbox`ļ¼ˆä¼˜åŒ–å™Øå†™å…„ļ¼‰ć€‚ | +| **č‡ŖåŠØå›žę»š** | `update_source=False` ę—¶ä¼˜åŒ–å™ØåœØ `finally` é˜¶ę®µč°ƒē”Ø `write` ęŠŠå­—ę®µčæ˜åŽŸäøŗčæč”Œå¼€å§‹ę—¶é€ščæ‡ `read` čŽ·å–ēš„ baseline åæ«ē…§ļ¼Œéæå…ę²™ē®±č¢«é—ē•™ēš„å€™é€‰ę±”ęŸ“ć€‚ | + +## 3 Ā· čæč”Œē¤ŗä¾‹ + +### 3.1 å®‰č£…ä¾čµ– + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 é…ē½®ēŽÆå¢ƒå˜é‡ + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 启动 + +```bash +python examples/optimization/remote_prompt_store/run_optimization.py +``` + +åÆåŠØę—¶č„šęœ¬ä¼šå…ˆč°ƒ `reset_store(...)` 把 production / sandbox éƒ½åˆå§‹åŒ–äøŗ baseline怂**čæ™äø€ę­„ä»…ē”ØäŗŽę¼”ē¤ŗ**ā€”ā€”ēœŸå®žäøšåŠ”äø­ē”Ÿäŗ§ namespace 已由 ops ē»“ęŠ¤ļ¼Œę— éœ€é‡ē½®ć€‚ + +### 3.4 äŗ§ē‰©ē»“ęž„ + +``` +runs// +ā”œā”€ā”€ result.json +ā”œā”€ā”€ summary.txt +ā”œā”€ā”€ baseline_prompts/ čæč”Œå‰ä»Ž KV čÆ»å–ēš„ baseline 快照 +ā”œā”€ā”€ best_prompts/ val é›†å¾—åˆ†ęœ€é«˜ēš„å€™é€‰ļ¼ˆå¾…äŗŗå·„å®”ę‰¹ļ¼‰ +└── rounds/ + +store/store.json KV ęŒä¹…åŒ–ę–‡ä»¶ļ¼ˆę¼”ē¤ŗē”Øļ¼‰ + 收尾时 sandbox key å·²č¢«å›žę»šåˆ° baseline + production key å…ØēØ‹ęœŖå˜ +``` + +## 4 Ā· ęž¶ęž„äøŽę•°ę®ęµ + +``` +[é…ē½®äø­åæƒ KV] + ā”œā”€ā”€ 
"system_prompt:production" ← ēŗæäøŠęœåŠ”čÆ»čæ™é‡Œļ¼ˆę°øčæœäøč¢«ä¼˜åŒ–å™Øč§¦ē¢°ļ¼‰ + └── "system_prompt:sandbox" ← ä¼˜åŒ–å™ØčÆ» / å†™čæ™é‡Œ + +[run_optimization.py] + │ + ā”œā”€ā”€ reset_store(BASELINE_PROMPT) ę¼”ē¤ŗå‰ē½®ļ¼šproduction = sandbox = baseline + │ ļ¼ˆēœŸå®žäøšåŠ”č·³čæ‡ę­¤ę­„ļ¼‰ + │ + ā”œā”€ā”€ TargetPrompt.add_callback( + │ "system_prompt", + │ read=read_sandbox_prompt, async () -> str 读 sandbox key + │ write=write_sandbox_prompt, async (str) -> None 写 sandbox key + │ ) + │ + ā”œā”€ā”€ call_agent(query): + │ prompt_text = await read_sandbox_prompt() # ēŽ°čÆ»ēŽ°ē”Ø + │ agent = create_agent(prompt_text) # å³ę—¶ęž„é€  + │ return await runner.run_async(...) # č·‘äø€ę¬”ęŽØē† + │ + └── AgentOptimizer.optimize(update_source=False, ...) + ā”œā”€ā”€ ęÆč½®ęŠŠå€™é€‰ prompt 写兄 sandbox key + ā”œā”€ā”€ ę”¶å°¾ļ¼šsandbox key č‡ŖåŠØå›žę»šåˆ° baseline 快照 + └── best_prompts/ č½ęœ¬åœ°ļ¼Œå¾…äŗŗå·„å®”ę‰¹ +``` + +### 4.1 ę–‡ä»¶ęø…å• + +| ꖇ件 | 角色 | ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ēš„äæ®ę”¹ę–¹å‘ | +| --- | --- | --- | +| `run_optimization.py` | ä¼˜åŒ–å™Øå…„å£ļ¼Œę³Øå†Œ callback | `reset_store(...)` 改为 ops ēœŸå®žåˆå§‹åŒ–ļ¼ˆęˆ–ē›“ęŽ„åˆ é™¤ļ¼‰ļ¼›å…¶ä½™åŸŗęœ¬äøå˜ | +| `agent/agent.py` | LlmAgent å·„åŽ‚ļ¼Œprompt é€ščæ‡å‚ę•°ę³Øå…„ | ę›æę¢äøŗäøšåŠ” agent ęž„å»ŗé€»č¾‘ | +| `store/prompt_client.py` | async `read` / `write` å‡½ę•°å®šä¹‰ | **ę øåæƒę”¹é€ ē‚¹**ļ¼šęŠŠå†…éƒØå®žēŽ°ę›æę¢äøŗäøšåŠ”é…ē½®äø­åæƒ SDK č°ƒē”Øļ¼Œå‡½ę•°ē­¾åäæęŒäøå˜ | +| `store/fake_kv_store.py` | 本地 JSON ę–‡ä»¶ęØ”ę‹Ÿ KV | ēœŸå®žäøšåŠ”äø‹ę•“ä½“åˆ é™¤ | +| `optimizer.json` | 算法 + metric é…ē½® | äøŽ quickstart 一臓 | +| `train.evalset.json` / `val.evalset.json` | ę•°ę®é›† | ę›æę¢äøŗäøšåŠ”ē”Øä¾‹ | + +### 4.2 äøŽ `http_service/` ēš„åÆ¹ē…§ + +å”Æäø€å·®å¼‚åœØ `TargetPrompt` ēš„ę³Øå†Œę–¹å¼ļ¼š + +```python +# http_service:prompt åœØęœ¬åœ°ę–‡ä»¶ +target = TargetPrompt().add_path("system_prompt", "service/prompts/system.md") + +# remote_prompt_store:prompt 在远端 KV +target = TargetPrompt().add_callback( + "system_prompt", 
+ read=read_sandbox_prompt, + write=write_sandbox_prompt, +) +``` + +`optimizer.json`态`call_agent` ēš„ę•“ä½“ē»“ęž„ć€metric å®šä¹‰ć€äŗ§ē‰© layout å‡äæęŒäø€č‡“ć€‚ + +## 5 Ā· å…³é”®é…ē½® + +### 5.1 `update_source` ēš„å¼ŗåˆ¶ēŗ¦ęŸ + +远端 prompt åœŗę™Æäø‹**å¼ŗēƒˆå»ŗč®®å§‹ē»ˆäæęŒ `update_source=False`**ć€‚ē†ē”±ļ¼š + +- čæœē«Æé…ē½®é€šåøøę‰æę‹…ēŗæäøŠęµé‡ļ¼Œč‡ŖåŠØå†™å›žę„å‘³ē€ęœŖå®”ę‰¹å˜ę›“ē›“ęŽ„čæ›ē”Ÿäŗ§ +- å³ä¾æę²™ē®± namespace ä¹Ÿęœ‰č”č°ƒ / ē°åŗ¦ē­‰éšå¼ēŗ¦ęŸļ¼Œåŗ”éæå…č®©ę”†ęž¶ę›æäøšåŠ”åš"ęäŗ¤"决策 +- `update_source=False` ę—¶ä¼˜åŒ–å™Øę”¶å°¾ä¼šęŠŠę²™ē®±å›žę»šåˆ° baselineļ¼Œå”Æäø€é—ē•™ē‰©ę˜Æęœ¬åœ° `best_prompts/`ļ¼Œē”±äŗŗå·„ęˆ–å®”ę‰¹å·„å…·å†³å®šåŽē»­åŠØä½œ + +### 5.2 `read` / `write` ēš„å®žēŽ°ēŗ¦ęŸ + +| ēŗ¦ęŸ | čÆ“ę˜Ž | +| --- | --- | +| ē­¾ååæ…é”»ę˜Æ async | `read: async () -> str`ļ¼›`write: async (str) -> None` | +| `read` 异常处理 | ä¼˜åŒ–å™ØåÆåŠØęœŸä¼šč°ƒäø€ę¬” `read` čŽ·å– baseline åæ«ē…§ć€‚čÆ„ę¬”č°ƒē”ØęŠ›é”™ä¼šč®© `optimize()` ē›“ęŽ„ fail-fastļ¼Œå¼‚åøøé€ä¼ ē»™č°ƒē”Øę–¹ć€‚čæč”Œäø­ `read` ęŠ›é”™ä¼šåÆ¼č‡“å½“å‰ case 评测失蓄 | +| `write` 幂等性 | ä¼˜åŒ–å™Øę”¶å°¾ę—¶ä¼šå†ę¬”č°ƒ `write` ęŠŠę²™ē®±å›žę»šåˆ° baseline;苄 `write` äøå¹‚ē­‰ęˆ–ę— äŗ‹åŠ”äæęŠ¤ļ¼Œå›žę»šåÆčƒ½å¤±č“„ć€‚å»ŗč®®å®žēŽ°ę”ÆęŒé‡å¤č°ƒē”ØåŒäø€ value | +| é‡čÆ• | é…ē½®äø­åæƒ SDK é€šåøøęœ‰å†…ē½®é‡čÆ•ļ¼›ęœ¬ example ēš„ `read` / `write` äøé¢å¤–å°č£…é‡čÆ•ļ¼ŒäøšåŠ”ę–¹ęŒ‰éœ€č‡Ŗč”ŒåŠ äøŠ | + +## 6 Ā· ęŽ„å…„ēœŸå®žé…ē½®äø­åæƒ + +将 `store/prompt_client.py` å†…éƒØå®žēŽ°ę›æę¢äøŗäøšåŠ” SDK č°ƒē”Øļ¼Œ**äæęŒå‡½ę•°ē­¾åäøå˜**: + +```python +# store/prompt_client.py ę›æę¢ē¤ŗä¾‹ +async def read_sandbox_prompt() -> str: + return await your_config_sdk.get( + namespace="sandbox", + key="system_prompt", + ) + +async def write_sandbox_prompt(value: str) -> None: + await your_config_sdk.put( + namespace="sandbox", + key="system_prompt", + value=value, + ) +``` + +`run_optimization.py` äø­ `TargetPrompt.add_callback(...)` č°ƒē”ØäøŽå…¶ä»–é…ē½®ę— éœ€äæ®ę”¹ć€‚ + +`fake_kv_store.py` 
åœØēœŸå®žęŽ„å…„åŽåÆę•“ä½“åˆ é™¤ć€‚ + +## 7 Ā· åøøč§é—®é¢˜ + +**Qļ¼šäøšåŠ”ęœåŠ”åœØå¦äø€äøŖčæ›ēØ‹ļ¼Œä¼˜åŒ–å™Øå†™å…„ę²™ē®±åŽęœåŠ”čƒ½ę„ŸēŸ„å—ļ¼Ÿ** +Aļ¼šå–å†³äŗŽäøšåŠ”ęœåŠ”ēš„ prompt åŠ č½½ē­–ē•„ć€‚**äøšåŠ”ęœåŠ”åæ…é”»åœØęÆę¬”čÆ·ę±‚ę—¶é‡ę–°ę‹‰é…ē½®**ļ¼ˆå³"热加载"ļ¼‰ļ¼Œå¦åˆ™ä¼˜åŒ–å™Øēš„å†™å…„åÆ¹ęœåŠ”äøåÆč§ć€åę€å¾ŖēŽÆå¤±ę•ˆć€‚čæ™ę˜ÆäøŽ `http_service/` example å®Œå…Øē›øåŒēš„ēŗ¦ęŸļ¼ŒåŖę˜Æä»‹č“Øä»Žęœ¬åœ°ę–‡ä»¶ę¢ęˆäŗ†čæœē«Æ KV怂 + +**Q:`reset_store(BASELINE_PROMPT)` åœØē”Ÿäŗ§ēŽÆå¢ƒä¹Ÿč¦č°ƒå—ļ¼Ÿ** +Aļ¼šäøč¦ć€‚čÆ„č°ƒē”Øä»…ē”ØäŗŽę¼”ē¤ŗé¦–ę¬”ęŽ„å…„ę—¶ęŠŠ KV åˆå§‹åŒ–åˆ°å·²ēŸ„ēŠ¶ę€ć€‚ēœŸå®žäøšåŠ”ēš„ē”Ÿäŗ§ namespace 已由 ops ē»“ęŠ¤ļ¼Œä¼˜åŒ–å™Ø**åŖå…³åæƒčÆ» / 写沙箱**怂 + +**Q:`read` äø€ę¬”čæ”å›žēš„å†…å®¹ä¼šč¢«ē¼“å­˜å—ļ¼Ÿ** +Aļ¼šäøä¼šć€‚ä¼˜åŒ–å™ØåœØęÆę¬”čÆ„ęµ‹å€™é€‰å‰éƒ½é‡ę–°č°ƒ `read`ļ¼Œå› ę­¤ę²™ē®±č¢«å†™å…„ę–°å€¼åŽäø‹äø€ę¬” `call_agent` ē«‹å³ē”Ÿę•ˆć€‚ + +**Qļ¼šč·‘å®ŒåŽå¦‚ä½•åŒę­„å€™é€‰åˆ°ē”Ÿäŗ§ļ¼Ÿ** +A:本 example ēš„äŗ§ē‰© `best_prompts/system_prompt.md` äøŗäŗŗå·„å®”ę‰¹čµ·ē‚¹ć€‚å»ŗč®®ēš„å·„ä½œęµļ¼šäŗŗå·„ review → é€ščæ‡å®”ę‰¹å·„å…·č°ƒē”ØäøšåŠ”č‡Ŗęœ‰ SDK ęŠŠå€™é€‰å†™å…„ production namespaceļ¼ˆäøé€ščæ‡ęœ¬ę”†ęž¶ļ¼‰ć€‚ + +**Qļ¼ščƒ½å¦ä¼˜åŒ–å¤šäøŖčæœē«Æå­—ę®µļ¼Ÿ** +Aļ¼šåÆä»„ć€‚`TargetPrompt` ę”ÆęŒå¤šę¬” `add_callback`ļ¼ŒęÆę¬”ę³Øå†Œäø€ē»„ē‹¬ē«‹ēš„ `read` / `write`ć€‚å¤šå­—ę®µč”åˆä¼˜åŒ–ēš„ē®—ę³•å±‚é…ē½®å‚č§ `multi_agent_pipeline/` example怂 + +## 8 Ā· ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ēš„ę­„éŖ¤ + +1. **ę›æę¢ `store/prompt_client.py`**ļ¼šå®žēŽ° `read_sandbox_prompt` / `write_sandbox_prompt` č°ƒē”ØäøšåŠ”é…ē½®äø­åæƒ SDK +2. **删除 `reset_store(...)` č°ƒē”Ø** ęˆ–ę”¹äøŗäøšåŠ”ēœŸå®žåˆå§‹åŒ–é€»č¾‘ +3. **修改 `agent/agent.py`**ļ¼šåÆ¹ęŽ„äøšåŠ”ęØ”åž‹ / tools / output schema +4. **ę›æę¢ę•°ę®é›†**:`train.evalset.json` / `val.evalset.json` +5. **äæęŒ `update_source=False`**ļ¼šåˆč§„ēŗ¦ęŸ +6. 
**运蔌**ļ¼šč§‚åÆŸ `summary.txt` äøŽ `result.json`ļ¼›ęœ€ä¼˜å€™é€‰ä½äŗŽ `runs//best_prompts/`ļ¼Œē”±äŗŗå·„å®”ę‰¹åŽé€ščæ‡äøšåŠ”č‡Ŗęœ‰ęµēØ‹åŒę­„åˆ°ē”Ÿäŗ§ diff --git a/examples/optimization/remote_prompt_store/agent/__init__.py b/examples/optimization/remote_prompt_store/agent/__init__.py new file mode 100644 index 0000000..bc6e483 --- /dev/null +++ b/examples/optimization/remote_prompt_store/agent/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/optimization/remote_prompt_store/agent/agent.py b/examples/optimization/remote_prompt_store/agent/agent.py new file mode 100644 index 0000000..d651214 --- /dev/null +++ b/examples/optimization/remote_prompt_store/agent/agent.py @@ -0,0 +1,48 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""ę•°å­¦é¢˜ę±‚č§£ agent —— Remote Prompt Store example 专用。 + +äøŽ quickstart / http_service ēš„å…³é”®å·®å¼‚ +--------------------------------------- +本 agent **äøčÆ» prompt ꖇ件**——prompt é€ščæ‡ create_agent(prompt_text) ēš„ +å…„å‚ä¼ å…„ć€‚call_agent åœØęÆę¬”č°ƒē”Øę—¶å…ˆä»Žčæœē«Æ KV ꋉ꜀ꖰ promptļ¼Œå†ē”Øå®ƒ +Ꞅ造 agent å®žä¾‹ć€‚ + +čæ™ē§"prompt é€ščæ‡å‚ę•°ę³Øå…„"ēš„å½¢ę€ę˜Æčæœē«Æ KV åœŗę™Æēš„č‡Ŗē„¶å†™ę³•ļ¼šäøšåŠ”ęœåŠ” +åœØęÆę¬”čÆ·ę±‚ę—¶ä»Žé…ē½®äø­åæƒę‹‰ promptļ¼Œå†åˆ›å»ŗ agentļ¼Œäøä¾čµ–ä»»ä½•ęœ¬åœ°ę–‡ä»¶ć€‚ +""" + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.types import GenerateContentConfig + +from .config import get_model_config + + +def _create_model() -> LLMModel: + """ęž„å»ŗ OpenAI 兼容 chat ęØ”åž‹å®žä¾‹ć€‚å‡­ę®ä»ŽēŽÆå¢ƒå˜é‡čÆ»å–ć€‚""" + api_key, base_url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url) + + +def create_agent(prompt_text: str) -> LlmAgent: + """ē”Øē»™å®š prompt ę–‡ęœ¬ęž„é€ äø€äøŖ LlmAgent å®žä¾‹ć€‚ + + å‚ę•° prompt_text ē”±č°ƒē”Øę–¹ļ¼ˆcall_agentļ¼‰ä»Žčæœē«Æ KV ēŽ°čÆ»ēŽ°ä¼ ļ¼Œ + ę‰€ä»„ä¼˜åŒ–å™ØęŠŠå€™é€‰å†™å…„ KV åŽäø‹äø€ę¬”č°ƒē”Øē«‹å³ē”Ÿę•ˆć€‚ + """ + return LlmAgent( + name="math_word_problem_agent", + description="Math word-problem solver whose prompt lives in a remote KV store.", + model=_create_model(), + instruction=prompt_text, + generate_content_config=GenerateContentConfig( + temperature=0.2, + top_p=0.9, + max_output_tokens=2048, + ), + ) diff --git a/examples/optimization/remote_prompt_store/agent/config.py b/examples/optimization/remote_prompt_store/agent/config.py new file mode 100644 index 0000000..d0a64b1 --- /dev/null +++ b/examples/optimization/remote_prompt_store/agent/config.py @@ -0,0 +1,33 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. 
All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""ęØ”åž‹å‡­ę®čÆ»å– —— ä»ŽēŽÆå¢ƒå˜é‡åŠ č½½ OpenAI 兼容 LLM ēš„čæžęŽ„äæ”ęÆć€‚ + +éœ€č¦ēš„ēŽÆå¢ƒå˜é‡ +-------------- + TRPC_AGENT_API_KEY LLM åŽē«Æēš„ API key + TRPC_AGENT_BASE_URL LLM åŽē«Æēš„ endpoint + TRPC_AGENT_MODEL_NAME ęØ”åž‹å + +ē¼ŗä»»ę„äø€äøŖå°±ē«‹å³ęŠ› ValueErrorļ¼Œéæå…čæč”Œåˆ°äø€åŠę‰ę’žåˆ° LLM åŽē«Æēš„ 401 é”™čÆÆļ¼Œ +é‚£ę—¶ęŠ„é”™äæ”ęÆä¼šå¾ˆęœ‰čæ·ęƒ‘ę€§ļ¼ˆēœ‹čµ·ę„åƒ prompt å†™é”™äŗ†ļ¼Œå®žé™…ę˜Æå‡­ę®ę²”é…ļ¼‰ć€‚ +""" + +from __future__ import annotations + +import os + + +def get_model_config() -> tuple[str, str, str]: + """čæ”å›ž (api_key, base_url, model_name)ļ¼›ä»»äø€ē¼ŗå¤±ē«‹åˆ»ęŠ„é”™ć€‚""" + api_key = os.getenv("TRPC_AGENT_API_KEY", "") + base_url = os.getenv("TRPC_AGENT_BASE_URL", "") + model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "") + if not api_key or not base_url or not model_name: + raise ValueError( + "čæč”Œä¼˜åŒ–å™Øå‰åæ…é”»é…ē½®ēŽÆå¢ƒå˜é‡ TRPC_AGENT_API_KEY / " + "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME怂" + ) + return api_key, base_url, model_name diff --git a/examples/optimization/remote_prompt_store/optimizer.json b/examples/optimization/remote_prompt_store/optimizer.json new file mode 100644 index 0000000..03f74a5 --- /dev/null +++ b/examples/optimization/remote_prompt_store/optimizer.json @@ -0,0 +1,45 @@ +{ + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": { + "match": "contains", + "case_insensitive": true + } + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": { + "required_metrics": "all" + }, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { + "max_tokens": 4096, + "temperature": 0.6 + } + }, + 
"candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 40, + "score_threshold": 1.0, + "max_iterations_without_improvement": 5 + } + } +} diff --git a/examples/optimization/remote_prompt_store/run_optimization.py b/examples/optimization/remote_prompt_store/run_optimization.py new file mode 100644 index 0000000..bf770db --- /dev/null +++ b/examples/optimization/remote_prompt_store/run_optimization.py @@ -0,0 +1,161 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Remote Prompt Store example ēš„ä¼˜åŒ–å™Øå…„å£ć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +业劔 prompt äøåœØęœ¬åœ°ę–‡ä»¶ļ¼Œč€Œē”± ops é…åœØčæœē«Æé…ē½®äø­åæƒļ¼ˆäøƒå½©ēŸ³ / Apollo / +Nacos / 自研 KVļ¼‰ć€‚ęœ¬č„šęœ¬ę¼”ē¤ŗé€ščæ‡ TargetPrompt.add_callback ęŽ„å…„ē”Øęˆ·ęä¾› +ēš„ async read / write å‡½ę•°čÆ»å†™čæœē«Æļ¼Œå¹¶é€ščæ‡ production / sandbox 双 +namespace éš”ē¦»ē”Ÿäŗ§ę•°ę®ć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +1. ļ¼ˆę¼”ē¤ŗē”Øļ¼‰reset_store 把 production + sandbox éƒ½åˆå§‹åŒ–äøŗ baseline +2. ę³Øå†Œ add_callbackļ¼šä¼˜åŒ–å™Øé€ščæ‡ read_sandbox_prompt / write_sandbox_prompt + å¼‚ę­„å‡½ę•°äøŽę²™ē®± namespace 交互 +3. 定义 call_agentļ¼šęÆę¬”č°ƒē”Øå…ˆä»Ž KV ꋉ꜀ꖰ prompt å†ęž„é€  agent +4. 调 AgentOptimizer.optimize č·‘ GEPA åę€å¾ŖēŽÆ +5. ę”¶å°¾ę—¶ę‰“å°ē”Ÿäŗ§ / 沙箱 namespace ēš„ēŠ¶ę€å˜åŒ– + +ę€Žä¹ˆč·‘ +------ +1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +2) python examples/optimization/remote_prompt_store/run_optimization.py +3) ēœ‹ runs/<ę—¶é—“ęˆ³>/best_prompts/system_prompt.mdļ¼ˆå¾…äŗŗå·„å®”ę‰¹ļ¼‰ + +ęŽ„å…„č‡Ŗęœ‰é…ē½®äø­åæƒę—¶ę”¹å“Ŗé‡Œ +------------------------ +- 删除 reset_store(...) 
č°ƒē”Øļ¼ˆēœŸå®žäøšåŠ”äø‹ē”Ÿäŗ§ namespace 已由 ops ē»“ęŠ¤ļ¼‰ +- ę›æę¢ store/prompt_client.py äø­ read/write å‡½ę•°ēš„å†…éƒØå®žēŽ°äøŗäøšåŠ” SDK č°ƒē”Ø +- update_source=False äø„ę ¼äæęŒļ¼ˆé˜²ē”Ÿäŗ§č¢«ęœŖå®”ę‰¹å˜ę›“č¦†ē›–ļ¼‰ +- č·‘å®ŒåŽē”±äŗŗå·„å®”ę‰¹å·„å…·ęŠŠ best_prompts/ åŒę­„åˆ°ē”Ÿäŗ§ +""" + +from __future__ import annotations + +import asyncio +import sys +import uuid +from datetime import datetime +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + +from agent.agent import create_agent +from store.prompt_client import ( + PROMPT_KEY_PRODUCTION, + read_production_prompt, + read_sandbox_prompt, + reset_store, + write_sandbox_prompt, +) + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "train.evalset.json" +VAL_PATH = _HERE / "val.evalset.json" +RUNS_DIR = _HERE / "runs" +APP_NAME = "remote_prompt_store_demo_agent" + +# 演示用 baselineć€‚ēœŸå®žäøšåŠ”é‡Œčæ™äø€ę­„åÆ¹åŗ”"ops å·²ē»åœØē”Ÿäŗ§ KV 配儽 prompt"怂 +BASELINE_PROMPT = ( + "ä½ ę˜Æäø€äøŖå‹å„½ēš„čŠå¤©åŠ©ę‰‹ļ¼Œå–œę¬¢å’Œē”Øęˆ·åˆ†äŗ«ęƒ³ę³•ć€‚å›žē­”ē”Øęˆ·é—®é¢˜ę—¶ļ¼Œ" + "čÆ·å°½é‡ē”Øē”ŸåŠØć€åÆŒęœ‰äŗŗęƒ…å‘³ēš„čÆ­čØ€ļ¼Œč®©ē”Øęˆ·ę„Ÿč§‰åƒę˜ÆåœØå’Œęœ‹å‹čŠå¤©ć€‚\n" +) + + +async def call_agent(query: str) -> str: + """ę”†ęž¶å›žč°ƒļ¼šä»Žę²™ē®± KV ꋉ꜀ꖰ prompt → Ꞅ造 agent → č·‘äø€ę¬”ęŽØē†ć€‚ + + ęÆę¬”č°ƒē”Øéƒ½é‡čÆ» KVļ¼ŒäæčÆä¼˜åŒ–å™Øå†™å…„ę–°å€™é€‰åŽē«‹å³ē”Ÿę•ˆć€‚ęÆę¬”ę–°å»ŗ + Runner + InMemorySessionService ē»™ęÆäøŖ case ē‹¬ē«‹ēš„ session state, + å¹¶å‘čÆ„ęµ‹ę—¶äøäŗ’ē›øę±”ęŸ“ć€‚ + """ + prompt_text = await read_sandbox_prompt() + agent = 
create_agent(prompt_text) + + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "optimizer" + await session_service.create_session( + app_name=APP_NAME, user_id=user_id, session_id=session_id, state={}, + ) + user_content = Content(role="user", parts=[Part.from_text(text=query)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, session_id=session_id, new_message=user_content, + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return final_text.strip() + + +async def main() -> None: + """组装 TargetPrompt(add_callback)+ 调 AgentOptimizer.optimize怂""" + # ę¼”ē¤ŗå‰ē½®ļ¼šęŠŠ KV é‡ē½®åˆ°"ops åˆšé…å„½ē”Ÿäŗ§ prompt + åŒę­„åˆ°ę²™ē®±"ēš„åˆå§‹ę€ć€‚ + # ēœŸå®žäøšåŠ”äø‹äøéœ€č¦čæ™äø€ę­„ā€”ā€”äøšåŠ”ę–¹ēš„ē”Ÿäŗ§ KV å·²ē»ęœ‰ prompt怂 + reset_store(BASELINE_PROMPT) + + # 用 add_callback 而非 add_pathļ¼šä¼˜åŒ–å™Øé€ščæ‡äø¤äøŖå¼‚ę­„å‡½ę•°äøŽę²™ē®±äŗ¤äŗ’ļ¼Œ + # KV åŽē«Æå½¢ę€åÆ¹ä¼˜åŒ–å™Øå®Œå…Øé»‘ē›’ć€‚ + target = TargetPrompt().add_callback( + "system_prompt", + read=read_sandbox_prompt, + write=write_sandbox_prompt, + ) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / timestamp + + result = await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + # 远端 prompt åœŗę™Æäø‹äø„ę ¼äæęŒ Falseļ¼šč·‘å®Œč‡ŖåŠØęŠŠę²™ē®±å›žę»šåˆ° baseline, + # ē”Ÿäŗ§ namespace ę°øčæœäøč¢«č§¦ē¢°ć€‚ęœ€ä½³å€™é€‰å†™åˆ° output_dir/best_prompts/, + # ē”±äŗŗå·„å®”ę‰¹åŽé€ščæ‡å•ē‹¬č„šęœ¬ / å·„å•ęµēØ‹åŒę­„åˆ°ē”Ÿäŗ§ć€‚ + update_source=False, + verbose=1, + ) + + # 
演示"å®”ę‰¹åŽåŒę­„"å·„ä½œęµļ¼šå®žé™…ē”Ÿäŗ§äø­äø‹ę–¹é€»č¾‘ē”±ē‹¬ē«‹å®”ę‰¹å·„å…·č§¦å‘ć€‚ + print("\n=== ä¼˜åŒ–å·²å®Œęˆ ===") + print(f"baseline → best : {result.baseline_pass_rate:.4f} → {result.best_pass_rate:.4f}") + production_text = await read_production_prompt() + sandbox_text = await read_sandbox_prompt() + print(f"\n[KV] production ({PROMPT_KEY_PRODUCTION}) 内容长度: {len(production_text)} 字 (ęœŖå˜)") + print(f"[KV] sandbox å·²č‡ŖåŠØå›žę»šåˆ° baselineļ¼Œé•æåŗ¦: {len(sandbox_text)} 字") + print(f"\n请在 {output_dir}/best_prompts/system_prompt.md ęŸ„ēœ‹ęœ€ä½³å€™é€‰ļ¼›") + print("äŗŗå·„å®”ę‰¹é€ščæ‡åŽļ¼Œå†č°ƒē”Ø store.prompt_client äø­ēš„å·„å…·åŒę­„åˆ°ē”Ÿäŗ§ć€‚") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/remote_prompt_store/store/__init__.py b/examples/optimization/remote_prompt_store/store/__init__.py new file mode 100644 index 0000000..bc6e483 --- /dev/null +++ b/examples/optimization/remote_prompt_store/store/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/optimization/remote_prompt_store/store/fake_kv_store.py b/examples/optimization/remote_prompt_store/store/fake_kv_store.py new file mode 100644 index 0000000..f694420 --- /dev/null +++ b/examples/optimization/remote_prompt_store/store/fake_kv_store.py @@ -0,0 +1,53 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""ę¼”ē¤ŗē”Øļ¼šęœ¬åœ° JSON ę–‡ä»¶ęØ”ę‹Ÿčæœē«Æé…ē½®äø­åæƒć€‚ + +ēœŸå®žäøšåŠ”äø‹čæ™äø€å±‚ę¢ęˆäøƒå½©ēŸ³ / Apollo / Nacos / 自研 KV ēš„ SDK å³åÆļ¼Œ +åÆ¹å¤–ęš“éœ²ēš„ read(key) / write(key, value) 同歄 API äæęŒäøå˜ć€‚ + +ę•°ę® schema +----------- +store.json ę˜Æäø€äøŖ {key: value} å­—å…øļ¼Œęœ¬ example ē”Øäø¤äøŖå›ŗå®š key: + "system_prompt:production" ē”Ÿäŗ§ namespace ēš„ prompt + "system_prompt:sandbox" 沙箱 namespace ēš„ promptļ¼ˆä¼˜åŒ–å™ØčÆ»å†™ļ¼‰ +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + + +class FakeKVStore: + """JSON ę–‡ä»¶ęŒä¹…åŒ–ēš„ KVļ¼Œē®€åŒ–ē‰ˆēš„čæœē«Æé…ē½®äø­åæƒć€‚""" + + def __init__(self, path: Path) -> None: + self._path = path + if not self._path.exists(): + self._path.write_text("{}", encoding="utf-8") + + def _load(self) -> dict[str, Any]: + return json.loads(self._path.read_text(encoding="utf-8")) + + def _save(self, data: dict[str, Any]) -> None: + self._path.write_text( + json.dumps(data, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + def read(self, key: str) -> str: + """读 KVļ¼›ē¼ŗå¤±ę—¶ęŠ› KeyErrorļ¼ŒęØ”ę‹Ÿčæœē«Æ"é…ē½®äøå­˜åœØ"怂""" + data = self._load() + if key not in data: + raise KeyError(f"prompt key not found in store: {key}") + return str(data[key]) + + def write(self, key: str, value: str) -> None: + """č¦†ē›–å¼å†™å…„ć€‚""" + data = self._load() + data[key] = value + self._save(data) diff --git a/examples/optimization/remote_prompt_store/store/prompt_client.py b/examples/optimization/remote_prompt_store/store/prompt_client.py new file mode 100644 index 0000000..2291cec --- /dev/null +++ b/examples/optimization/remote_prompt_store/store/prompt_client.py @@ -0,0 +1,85 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""prompt KV ēš„ async 访问层 —— TargetPrompt.add_callback ęœŸęœ›ēš„å®žēŽ°ć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +ęœ¬ę–‡ä»¶ę˜Æ add_callback ęŽ„å…„čæœē«Æé…ē½®äø­åæƒēš„**ę øåæƒęØ”ęæ**ć€‚å‡½ę•°ē­¾åäø„ę ¼åŒ¹é… +add_callback ēš„åč®®ļ¼šread 是 async () -> str,write 是 async (str) -> None怂 + +namespace éš”ē¦»č®¾č®” +------------------ +- productionļ¼šäøšåŠ”ēŗæäøŠčÆ»å–ēš„ prompt,**ę°øčæœäøč¢«ä¼˜åŒ–å™Øå†™å…„** +- sandboxļ¼šä¼˜åŒ–å™ØčÆ» / å†™ēš„å·„ä½œ namespaceļ¼›update_source=False ę—¶ä¼˜åŒ–å™Ø + åœØę”¶å°¾é˜¶ę®µęŠŠ sandbox č‡ŖåŠØå›žę»šåˆ° baseline 快照 + +ęŽ„å…„č‡Ŗęœ‰é…ē½®äø­åæƒę—¶ę”¹å“Ŗé‡Œ +------------------------ +äæęŒå››äøŖå…¬å¼€ async å‡½ę•°ēš„ē­¾åäøå˜ļ¼ŒęŠŠå†…éƒØå®žēŽ°ä»Ž FakeKVStore ę›æę¢äøŗ +äøšåŠ”ēœŸå®ž SDK č°ƒē”Øļ¼š + + async def read_sandbox_prompt() -> str: + return await your_config_sdk.get(namespace="sandbox", key="system_prompt") + + async def write_sandbox_prompt(value: str) -> None: + await your_config_sdk.put(namespace="sandbox", key="system_prompt", value=value) + +run_optimization.py äø­ add_callback č°ƒē”Øę— éœ€äæ®ę”¹ć€‚ +""" + +from __future__ import annotations + +import asyncio +from pathlib import Path + +from .fake_kv_store import FakeKVStore + + +# ę¼”ē¤ŗē”Øļ¼šęœ¬åœ° JSON ę–‡ä»¶ęŒä¹…åŒ–ēš„ KVć€‚ēœŸå®žäøšåŠ”é‡Œčæ™äø€å±‚ę¢ęˆé…ē½®äø­åæƒ SDK +# ēš„å…Øå±€ clientļ¼ˆå¦‚ _CFG_CLIENT = your_sdk.Client(...)ļ¼‰ļ¼Œäøå†ä¾čµ–ęœ¬ę–‡ä»¶ć€‚ +_STORE_PATH = Path(__file__).resolve().parent / "store.json" +_KV = FakeKVStore(_STORE_PATH) + +PROMPT_KEY_PRODUCTION = "system_prompt:production" +PROMPT_KEY_SANDBOX = "system_prompt:sandbox" + + +async def read_sandbox_prompt() -> str: + """ä»Žę²™ē®± namespace 读 promptā€”ā€”ä¼˜åŒ–å™ØčÆ„ęµ‹å€™é€‰ę—¶č°ƒē”Øć€‚ + + add_callback ęœŸęœ›ę­¤å‡½ę•°ę— å‚ę•°čæ”å›žå½“å‰ prompt ę–‡ęœ¬ć€‚ + """ + # ēœŸå®žåœŗę™Æäø‹čµ°ē½‘ē»œčÆ·ę±‚ļ¼›čæ™é‡Œ await asyncio.sleep(0) ęØ”ę‹Ÿäø€ę¬” await + # åˆ‡ē‚¹ļ¼Œč®©åēØ‹åœØ KV č°ƒē”Øå¤„åÆč¢«č°ƒåŗ¦ć€‚ + await asyncio.sleep(0) + return _KV.read(PROMPT_KEY_SANDBOX) + + +async def 
write_sandbox_prompt(value: str) -> None: + """写兄沙箱 namespaceā€”ā€”ä¼˜åŒ–å™Øč½å€™é€‰ / ę”¶å°¾å›žę»š baseline éƒ½čµ°čæ™é‡Œć€‚ + + add_callback ęœŸęœ›ę­¤å‡½ę•°ęŽ„å—ę–° prompt ę–‡ęœ¬ļ¼Œę— čæ”å›žå€¼ć€‚ + å®žēŽ°éœ€äæčÆå¹‚ē­‰ę€§ļ¼šä¼˜åŒ–å™Øę”¶å°¾ę—¶ä¼šå†ę¬”č°ƒęœ¬å‡½ę•°ęŠŠ sandbox 写回 baseline, + äøå¹‚ē­‰ēš„å†™å…„åÆčƒ½åÆ¼č‡“å›žę»šå¤±č“„ć€‚ + """ + await asyncio.sleep(0) + _KV.write(PROMPT_KEY_SANDBOX, value) + + +async def read_production_prompt() -> str: + """čÆ»ē”Ÿäŗ§ namespace ēš„ promptā€”ā€”é¦–ę¬”ęŽ„å…„ę—¶ē”Øå®ƒåˆå§‹åŒ–ę²™ē®±ć€‚""" + await asyncio.sleep(0) + return _KV.read(PROMPT_KEY_PRODUCTION) + + +def reset_store(production_prompt: str) -> None: + """ę¼”ē¤ŗē”Øļ¼šęŠŠ KV 初始化到 production / sandbox éƒ½äøŗē»™å®š prompt ēš„ēŠ¶ę€ć€‚ + + ēœŸå®žäøšåŠ”äø‹äøåŗ”č°ƒęœ¬å‡½ę•°ā€”ā€”äøšåŠ”ēš„ē”Ÿäŗ§ namespace ē”± ops 结护, + ä¼˜åŒ–å™ØåŖå…³åæƒčÆ» / 写沙箱。 + """ + _KV.write(PROMPT_KEY_PRODUCTION, production_prompt) + _KV.write(PROMPT_KEY_SANDBOX, production_prompt) diff --git a/examples/optimization/remote_prompt_store/store/store.json b/examples/optimization/remote_prompt_store/store/store.json new file mode 100644 index 0000000..3799863 --- /dev/null +++ b/examples/optimization/remote_prompt_store/store/store.json @@ -0,0 +1,4 @@ +{ + "system_prompt:production": "ä½ ę˜Æäø€äøŖå‹å„½ēš„čŠå¤©åŠ©ę‰‹ļ¼Œå–œę¬¢å’Œē”Øęˆ·åˆ†äŗ«ęƒ³ę³•ć€‚å›žē­”ē”Øęˆ·é—®é¢˜ę—¶ļ¼ŒčÆ·å°½é‡ē”Øē”ŸåŠØć€åÆŒęœ‰äŗŗęƒ…å‘³ēš„čÆ­čØ€ļ¼Œč®©ē”Øęˆ·ę„Ÿč§‰åƒę˜ÆåœØå’Œęœ‹å‹čŠå¤©ć€‚\n", + "system_prompt:sandbox": "ä½ ę˜Æäø€äøŖå‹å„½ēš„čŠå¤©åŠ©ę‰‹ļ¼Œå–œę¬¢å’Œē”Øęˆ·åˆ†äŗ«ęƒ³ę³•ć€‚å›žē­”ē”Øęˆ·é—®é¢˜ę—¶ļ¼ŒčÆ·å°½é‡ē”Øē”ŸåŠØć€åÆŒęœ‰äŗŗęƒ…å‘³ēš„čÆ­čØ€ļ¼Œč®©ē”Øęˆ·ę„Ÿč§‰åƒę˜ÆåœØå’Œęœ‹å‹čŠå¤©ć€‚\n" +} \ No newline at end of file diff --git a/examples/optimization/remote_prompt_store/train.evalset.json b/examples/optimization/remote_prompt_store/train.evalset.json new file mode 100644 index 0000000..1d2cb46 --- /dev/null +++ b/examples/optimization/remote_prompt_store/train.evalset.json @@ -0,0 +1,112 @@ +{ + 
"eval_set_id": "remote_prompt_store_train", + "name": "Remote prompt store demo - train", + "description": "5 é“å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ļ¼›agent ēš„ prompt é€ščæ‡čæœē«Æ KV callback 读写。", + "eval_cases": [ + { + "eval_id": "wp_apples_add", + "conversation": [ + { + "invocation_id": "t1", + "user_content": { + "parts": [{"text": "å°ę˜Žę—©äøŠä¹°äŗ† 4 äøŖč‹¹ęžœļ¼Œäø‹åˆåˆä¹°äŗ† 7 äøŖč‹¹ęžœļ¼Œä»–äø€å…±ęœ‰å¤šå°‘äøŖč‹¹ęžœļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š11 äøŖ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_car_distance", + "conversation": [ + { + "invocation_id": "t2", + "user_content": { + "parts": [{"text": "äø€č¾†ę±½č½¦ä»„ęÆå°ę—¶ 60 å…¬é‡Œēš„é€Ÿåŗ¦č”Œé©¶ 2.5 å°ę—¶ļ¼Œäø€å…±č”Œé©¶äŗ†å¤šå°‘å…¬é‡Œļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š150 公里"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_discount_price", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [{"text": "äø€ä»¶č”£ęœåŽŸä»· 200 å…ƒļ¼ŒēŽ°åœØę‰“ 8 ęŠ˜å‡ŗå”®ļ¼ŒęŠ˜åŽä»·ę˜Æå¤šå°‘å…ƒļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š160 元"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "wp_glasses_percent", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [{"text": "ē­é‡Œäø€å…±ęœ‰ 40 åå­¦ē”Ÿļ¼Œå…¶äø­ 25% ęˆ“ēœ¼é•œļ¼Œęˆ“ēœ¼é•œēš„ęœ‰å¤šå°‘äŗŗļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š10 äŗŗ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} 
+ } + }, + { + "eval_id": "wp_defect_items_percent", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [{"text": "一批商品共 50 ä»¶ļ¼Œå…¶äø­ 30% ę˜Æę¬”å“ļ¼Œę¬”å“ęœ‰å¤šå°‘ä»¶ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š15 ä»¶"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "trainer", + "state": {} + } + } + ] +} diff --git a/examples/optimization/remote_prompt_store/val.evalset.json b/examples/optimization/remote_prompt_store/val.evalset.json new file mode 100644 index 0000000..e3036b0 --- /dev/null +++ b/examples/optimization/remote_prompt_store/val.evalset.json @@ -0,0 +1,70 @@ +{ + "eval_set_id": "remote_prompt_store_val", + "name": "Remote prompt store demo - validation", + "description": "3 é“å°å­¦ē®—ęœÆåŗ”ē”Øé¢˜ļ¼›ē”ØäŗŽęÆč½®å…Øé‡čÆ„ä¼°ć€å†³å®šå€™é€‰ę˜Æå¦č¢«ęŽ„å—ć€‚", + "eval_cases": [ + { + "eval_id": "wp_seats_multiply", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [{"text": "ę•™å®¤é‡Œęœ‰ 5 ęŽ’åŗ§ä½ļ¼ŒęÆęŽ’ 8 äøŖļ¼Œäø€å…±å¤šå°‘äøŖåŗ§ä½ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š40 äøŖ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_water_weight", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [{"text": "已矄 1 å‡ę°“é‡ 1 åƒå…‹ļ¼Œ3.5 å‡ę°“é‡å¤šå°‘åƒå…‹ļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š3.5 千克"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "wp_class_girls_percent", + "conversation": [ + { + "invocation_id": "v3", + "user_content": { + "parts": [{"text": "ē­é‡Œäø€å…±ęœ‰ 30 äŗŗļ¼Œå…¶äø­ 60% 
ę˜Æå„³ē”Ÿļ¼ŒčÆ·é—®ęœ‰å¤šå°‘åå„³ē”Ÿļ¼Ÿ"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ē­”ę”ˆļ¼š18 äŗŗ"}], + "role": "model" + } + } + ], + "session_input": { + "app_name": "remote_prompt_store_demo_agent", + "user_id": "validator", + "state": {} + } + } + ] +} diff --git a/examples/optimization/slo_runtime_control/README.md b/examples/optimization/slo_runtime_control/README.md new file mode 100644 index 0000000..b21b8e2 --- /dev/null +++ b/examples/optimization/slo_runtime_control/README.md @@ -0,0 +1,218 @@ +# SLO Runtime Control — å¤šé‡åœę­¢ę”ä»¶äø‹ēš„čæč”Œę—¶ SLO å®ˆé—Ø + +> **é€‚ē”Øåœŗę™Æ**:在 CI 流氓线 / å¤œé—“ēŖ—å£ē­‰å…·ęœ‰ē”¬ę€§ę—¶é—“å’Œčµ„ęŗēŗ¦ęŸēš„ēŽÆå¢ƒäø‹čæč”Œ prompt ä¼˜åŒ–ļ¼Œéœ€č¦"任何一个 SLO č§¦å‘éƒ½ē«‹åˆ»åœ"ēš„å¤šé‡åœę­¢ē­–ē•„ć€‚ęœ¬ example ę¼”ē¤ŗåŒę—¶åÆē”Ø SDK ęä¾›ēš„å…ØéƒØ 6 ē§ algorithm-level stop conditionsļ¼Œå¹¶é€ščæ‡ä»»åŠ”č®¾č®”č®©ä»»äø€ę”ä»¶éƒ½ęœ‰ęœŗä¼šęˆäøŗęœ€å…ˆč§¦å‘č€…ć€‚é˜…čÆ»å‰čÆ·å…ˆē†Ÿę‚‰ `quickstart/README.md` §2怂 + +## 1 Ā· é€‚ē”Øé—®é¢˜äøŽč®¾č®”ē›®ę ‡ + +ēŗæäøŠäøšåŠ”č·‘ prompt ä¼˜åŒ–ēš„ēŽ°å®žēŗ¦ęŸļ¼š + +- **时闓预算甬性**:CI 流氓线必锻 N åˆ†é’Ÿå†…ē»“ęŸļ¼Œč¶…å‡ŗå³å¤±č“„ +- **č°ƒē”Øé¢„ē®—ē”¬ę€§**:LLM åŽē«Æé…é¢ęŒ‰ęœˆč®”ē®—ļ¼Œå•ę¬”ä¼˜åŒ–äøčƒ½č·‘ēˆ†é¢„ē®— +- **候选池规樔**ļ¼šå†…å­˜ / čÆ„ä¼°ę—¶é—“éšå€™é€‰ę± å¢žé•æļ¼Œéœ€č¦äøŠé™ +- **ę—©åœēµę•åŗ¦**ļ¼ščæžē»­č‹„å¹²č½®ę— ę”¹å–„åŗ”äø»åŠØę”¾å¼ƒļ¼Œäøč€—č“¹å‰©ä½™é¢„ē®— + +å•äø€åœę­¢ę”ä»¶ę— ę³•åŒę—¶č¦†ē›–ä»„äøŠčÆ‰ę±‚ć€‚SDK ęä¾›ēš„ 6 ē§ algorithm-level stop conditions 滔足"OR 语义"ā€”ā€”ä»»ę„äø€ę”č§¦å‘å³åœę­¢ļ¼Œä½æäøšåŠ”åÆä»„å åŠ å¤šé‡ SLO怂 + +| 输兄 | 输出 | +| --- | --- | +| 6 ē§ stop condition ēš„é˜ˆå€¼ē»„åˆ | ę»”č¶³ęœ€å…ˆč§¦å‘ę”ä»¶ēš„ęœ€ä¼˜å€™é€‰ | +| `OptimizeResult.stop_reason` 字段 | å“Ŗę” SLO ęŠ¢é—øēš„ę˜Žē”®åé¦ˆ | + +### 本 example ę¼”ē¤ŗēš„ęœ€å°ē”Øä¾‹ + +| 结度 | 值 | +| --- | --- | +| 业劔任劔 | å®¢ęœå·„å•åˆ†ē±»ļ¼ˆč¾“å…„å·„å•ę–‡ęœ¬ļ¼Œč¾“å‡ŗ `{category, priority}` JSON) | +| ä¼˜åŒ–ē›®ę ‡ | `agent/prompts/system.md` å•ę–‡ä»¶ | +| éŖŒčÆęŒ‡ę ‡ | 
`final_response_avg_score`(exact åŒ¹é…č§„čŒƒåŒ– JSON) | +| 训练 / éŖŒčÆč§„ęØ” | 8 case / 4 case | +| 任劔难度 | č®­ē»ƒé›†äø­ę··å…„ 3 é“č¾¹ē•Œę··ę·†é¢˜ļ¼Œä½æ score_threshold äøä¼šå…ˆęŠ¢é—øļ¼Œčƒ½č§‚åÆŸå…¶ä»– stopper ēœŸå®žč”Œäøŗ | + +## 2 Ā· ęœÆčÆ­åÆ¹ē…§ + +ä»…åˆ—å‡ŗęœ¬ example å¼•å…„ēš„ę–°ę¦‚åæµć€‚åŸŗē”€ęœÆčÆ­č§ `quickstart/README.md` §2怂 + +| 术语 | 含义 | +| --- | --- | +| **algorithm-level stop condition** | GEPA ē®—ę³•å†…éƒØēš„åœę­¢åˆ¤å®šļ¼ˆå¦‚é¢„ē®—ć€č¶…ę—¶ć€ę— ę”¹å–„ļ¼‰ļ¼Œå†™åœØ `optimizer.json` ēš„ `algorithm` ꮵ怂 | +| **framework-level metric stop** | ä¼˜åŒ–å™Øę”†ęž¶åŸŗäŗŽ metric é˜ˆå€¼ēš„ę—©åœåˆ¤å®šļ¼Œå†™åœØ `optimizer.json` ēš„ `optimize.stop` ę®µļ¼ˆå¦‚ `required_metrics`)。 | +| **OR 语义** | 多个 stop condition åŒę—¶åÆē”Øę—¶ļ¼Œ**ä»»ę„äø€ę”č§¦å‘å³åœę­¢**ć€‚čæ™ę˜Æęœ¬ example ēš„ę øåæƒęœŗåˆ¶ć€‚ | +| **ęŠ¢é—ø** | 在多 stop condition OR čÆ­ä¹‰äø‹ļ¼Œęœ€å…ˆę»”č¶³ę”ä»¶ēš„é‚£ę”å†³å®šęœ€ē»ˆ `stop_reason`怂 | +| **å®Œęˆå½“å‰č½®å†åœ** | timeout ē­‰ę”ä»¶č§¦å‘åŽę”†ęž¶äøä¼šē«‹å³ kill 当前 roundļ¼Œč€Œę˜Æē­‰å½“å‰ round å®Œęˆļ¼ˆéæå…å€™é€‰ę•°ę®äø¢å¤±/ę±”ęŸ“ļ¼‰ć€‚ | + +## 3 Ā· čæč”Œē¤ŗä¾‹ + +### 3.1 å®‰č£…ä¾čµ– + +```bash +pip install -e ".[optimize]" +``` + +### 3.2 é…ē½®ēŽÆå¢ƒå˜é‡ + +```bash +export TRPC_AGENT_API_KEY="" +export TRPC_AGENT_BASE_URL="" +export TRPC_AGENT_MODEL_NAME="" +``` + +### 3.3 启动 + +```bash +python examples/optimization/slo_runtime_control/run_optimization.py +``` + +ē»ˆē«Æå°†č¾“å‡ŗęÆč½®åˆ†ę•°äøŽęœ€ē»ˆ `stop_reason`ļ¼Œę˜Žē”®å‘ŠēŸ„ę˜Æå“Ŗę” SLO č§¦å‘äŗ†åœę­¢ć€‚ + +### 3.4 äŗ§ē‰©ē»“ęž„ + +``` +runs// +ā”œā”€ā”€ result.json å…¶äø­ stop_reason å­—ę®µę ‡čÆ†ęŠ¢é—øč€… +ā”œā”€ā”€ summary.txt +ā”œā”€ā”€ baseline_prompts/ +ā”œā”€ā”€ best_prompts/ +└── rounds/ +``` + +## 4 Ā· ęž¶ęž„äøŽę•°ę®ęµ + +``` +optimizer.optimize() + │ + ā”œā”€ baseline 评估 + │ + └─ for each round: + ā”œā”€ GEPA åę€ → candidate prompt + ā”œā”€ 写兄 system.md + ā”œā”€ EvalConfig č§¦å‘ call_agent for each train sample + │ └─ create_agent() → Runner.run_async() → 
_normalize_response() + │ ↓ + │ final_response_avg_score(text.match=exact) + │ + └─ 6 äøŖ stopper åœØęÆč½®ē»“ęŸę—¶ę£€ęŸ„ļ¼ˆOR čÆ­ä¹‰ļ¼‰ļ¼š + wall_clock ≄ 90s ? + metric_calls ≄ 30 ? + no_improvement ≄ 3 č½® ? + best_score ≄ 1.0 ? + proposals ≄ 12 ? + tracked_candidates ≄ 5 ? + ↓ + ä»»ę„äø€ę”ę»”č¶³ → ē«‹å³ę”¶å°¾ļ¼Œstop_reason 写兄 OptimizeResult +``` + +### 4.1 ę–‡ä»¶ęø…å• + +| ꖇ件 | 角色 | ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ēš„äæ®ę”¹ę–¹å‘ | +| --- | --- | --- | +| `run_optimization.py` | ä¼˜åŒ–å™Øå…„å£ļ¼ˆå« `_normalize_response`) | äøŽ quickstart 同 | +| `agent/agent.py` | LlmAgent 巄厂 | ę›æę¢äøŗäøšåŠ” agent | +| `agent/prompts/system.md` | baseline prompt | å†™å…„äøšåŠ” baseline | +| `optimizer.json` | **ę øåæƒę”¹é€ ē‚¹**:6 stop condition é˜ˆå€¼ē»„åˆ | ęŒ‰äøšåŠ” SLO č°ƒę•“ęÆę”é˜ˆå€¼ | +| `train.evalset.json` / `val.evalset.json` | ę•°ę®é›† | ę›æę¢äøŗäøšåŠ”ē”Øä¾‹ | + +## 5 Ā· 6 ē§ stop condition 详解 + +| 字段 | 本 example 值 | gepa å†…éƒØę˜ å°„ | ęŠ¢é—øę”ä»¶ | é€‚ē”Øåœŗę™Æ | +| --- | --- | --- | --- | --- | +| `max_metric_calls` | 30 | `MaxMetricCallsStopper` | 瓯讔 case 评估欔数 ≄ 30 | LLM é…é¢ē”¬äøŠé™ | +| `max_iterations_without_improvement` | 3 | `NoImprovementStopper` | čæžē»­ N č½® best valset ę— ęå‡ | ä¼˜åŒ–å·²ę”¶ę•›ęˆ–é™·å…„å±€éƒØęœ€ä¼˜ę—¶äø»åŠØę”¾å¼ƒ | +| `timeout_seconds` | 90.0 | `TimeoutStopCondition` | wall-clock ≄ N ē§’ | CI ęµę°“ēŗæę—¶é—“ēŖ—ē”¬ēŗ¦ęŸ | +| `score_threshold` | 1.0 | `ScoreThresholdStopper` | best valset pass_rate ≄ 阈值 | å·²č¾¾äøšåŠ”ē›®ę ‡ļ¼Œę— éœ€ē»§ē»­ | +| `max_candidate_proposals` | 12 | `MaxCandidateProposalsStopper` | reflection LM ē“Æč®”ęč®®ę¬”ę•° ≄ N | é™åˆ¶åę€ LM č°ƒē”Øé¢„ē®— | +| `max_tracked_candidates` | 5 | `MaxTrackedCandidatesStopper` | Pareto å‰ę²æå€™é€‰ę± å¤§å° ≄ N | ęŽ§åˆ¶å†…å­˜äøŽ merge 候选空闓规樔 | + +### 5.1 č‡³å°‘é… 1 äøŖ + +`optimizer.json` äø­č‡³å°‘é…ē½®äøŠčæ° 6 äøŖå­—ę®µäø­ēš„ 1 äøŖļ¼Œå¦åˆ™ę”†ęž¶åÆåŠØęœŸ `_require_at_least_one_stop_condition` ꊄ错怂**å¤šäøŖåŒę—¶åÆē”Øå³ OR 语义**ā€”ā€”ä»»äø€č§¦å‘ē«‹å³åœę­¢ć€‚ + 
+### 5.2 ę˜¾å¼ē¦ē”Ø framework-level metric ę—©åœ + +```jsonc +{ + "optimize": { + "stop": { + "required_metrics": [] // ę˜¾å¼ē¦ē”Øę”†ęž¶å±‚ metric ę—©åœ + }, + "algorithm": { + "max_metric_calls": 30, + "max_iterations_without_improvement": 3, + "timeout_seconds": 90.0, + "score_threshold": 1.0, + "max_candidate_proposals": 12, + "max_tracked_candidates": 5 + } + } +} +``` + +`required_metrics: []` 让 6 äøŖ algorithm ēŗ§ stopper ē‹¬å  stop å†³ē­–ęƒā€”ā€”éæå…ę”†ęž¶å±‚åœØ algorithm å±‚ä¹‹å‰ęå‰ē»ˆę­¢ļ¼Œå½±å“åÆ¹åŗ•å±‚ stopper č”Œäøŗēš„č§‚åÆŸć€‚ + +äøšåŠ”ēœŸå®žä½æē”Øę—¶ę˜Æå¦ē¦ē”Ø framework-level ę—©åœå–å†³äŗŽčÆ‰ę±‚ļ¼š + +- ä»…å…³åæƒ algorithm-level ę—¶åŗäøŽå¼€é”€ęŽ§åˆ¶ → ē¦ē”Øļ¼ˆęœ¬ example ēš„é€‰ę‹©ļ¼‰ +- åŒę—¶å…³åæƒ metric ę˜Æå¦č¾¾ę ‡ → 启用 `required_metrics: "all"` ęˆ–å…·ä½“ metric åˆ—č”Øļ¼ˆå‚č§ quickstart §5) + +## 6 Ā· å…³é”®é…ē½® + +### 6.1 timeout äøę˜Æ hard kill + +`timeout_seconds=90` č§¦å‘åŽę”†ęž¶äøä¼šē«‹å³ kill å½“å‰ę­£åœØč·‘ēš„ roundļ¼Œč€Œę˜Æē­‰å½“å‰ round ē»“ęŸć€‚å®žé™…ē»ˆę­¢ę—¶é—“é€šåøøč¶…čæ‡č®¾å®šå€¼ć€‚åŽŸå› ļ¼šäø­é€” kill ä¼šåÆ¼č‡“å€™é€‰ę•°ę®äø¢å¤± / ę–‡ä»¶å†™å…„ęˆŖę–­ć€‚ + +**äøšåŠ”é¢åŗ”åÆ¹**: + +- č‹„ SLO 是**甬截止**ļ¼ˆå¦‚ CI 流氓线必锻 N åˆ†é’Ÿå†…ē»“ęŸļ¼‰ļ¼ŒęŠŠ `timeout_seconds` č®¾äøŗēœŸå®žēŖ—å£ēš„äø€åŠå·¦å³ē•™å‡ŗē¼“å†² +- å•č½®å…øåž‹č€—ę—¶ē”± LLM č°ƒē”Øé€Ÿåŗ¦å†³å®šć€‚åÆé€ščæ‡ē¼©å° `reflection_minibatch_size` / `eval_case_parallelism` ęŽ§åˆ¶å•č½®ę—¶é•æ + +### 6.2 é˜ˆå€¼ä¹‹é—“ēš„ē›øåÆ¹å…³ē³» + +é˜ˆå€¼ä¹‹é—“åŗ”äæęŒč‡Ŗę“½ļ¼Œå¦åˆ™éƒØåˆ† stopper ę°øčæœäøä¼šč§¦å‘ļ¼š + +| 关系 | 含义 | +| --- | --- | +| `max_metric_calls > reflection_minibatch_size Ɨ max_iterations_without_improvement` | 否则 no_improvement ę°øčæœå…ˆęŠ¢é—ø | +| `timeout_seconds > å•č½®å…øåž‹č€—ę—¶ Ɨ 2` | 否则 timeout 在第 1 č½®å°±č§¦å‘ļ¼Œēœ‹äøåˆ°ä¼˜åŒ–čæ›å±• | +| `max_candidate_proposals ≄ 1` | 至少要让 reflection LM 跑过一欔 | +| `max_tracked_candidates ≄ 2` | 否则 Pareto å‰ę²æę— ę³•äæē•™å¤šäŗŽ baseline ēš„å€™é€‰ | + +### 6.3 `_normalize_response` ēš„å¤ē”Ø + +äøŽ 
`blackbox_cli/` example å®Œå…Øē›øåŒēš„č§„čŒƒåŒ–é€»č¾‘ļ¼šē”Ø `json.dumps(sort_keys=True, ensure_ascii=False, separators=(",", ":"))` 把 LLM č¾“å‡ŗč½¬ę¢äøŗå”Æäø€å­—ē¬¦äø²å½¢ę€ļ¼Œä½æ `final_response_avg_score(text.match=exact)` åÆē›“ęŽ„čµ°ē²¾ē”®åŒ¹é…ļ¼Œ**čÆ„ęµ‹å±‚äøä¾čµ– LLM judge**ā€”ā€”čæ™åÆ¹čæč”Œę—¶ęŽ§åˆ¶åœŗę™Æč‡³å…³é‡č¦ļ¼Œéæå… judge č°ƒē”Øå¼•å…„é¢å¤–äøē”®å®šę€§äøŽę—¶é—“å¼€é”€ć€‚ + +## 7 Ā· åøøč§é—®é¢˜ + +**Q:`stop_reason` å­—ę®µå€¼ęœ‰å“Ŗäŗ›ļ¼Ÿ** +Aļ¼šåøøč§å–å€¼åŒ…ę‹¬ `score_threshold_reached` / `budget_exhausted` / `timeout_reached` / `no_improvement` / `max_proposals_reached` / `max_tracked_candidates_reached` / `user_requested_stop`ļ¼ˆē”± `optimize.stop` ę–‡ä»¶č§¦å‘ļ¼‰ć€‚å…·ä½“å–å€¼ē”±č§¦å‘ēš„ stopper å†³å®šć€‚ + +**Qļ¼šč§¦å‘ timeout åŽäŗ§ē‰©å®Œę•“å—ļ¼Ÿ** +Aļ¼šå®Œę•“ć€‚"å®Œęˆå½“å‰č½®å†åœ"čÆ­ä¹‰äæčÆå½“å‰č½®ēš„ round_*.json态result.json态summary.txt éƒ½å·²å†™å…„ć€‚äø­é€”äø­ę­¢ä»…äø¢å¼ƒå°šęœŖå¼€å§‹ēš„äø‹äø€č½®ć€‚ + +**Qļ¼ščƒ½å¦č°ƒę•“ stop condition ēš„ä¼˜å…ˆēŗ§ļ¼Ÿ** +Aļ¼šäøčƒ½ć€‚å¤š stop condition é—“ę˜Æ OR čÆ­ä¹‰äø”åŒę­„ę£€ęŸ„ļ¼Œęœ€å…ˆę»”č¶³ę”ä»¶ēš„ stopper 决定 `stop_reason`ć€‚éœ€č¦"ä¼˜å…ˆēœ‹ timeout,timeout ä¹‹å†…å°½é‡č·‘é«˜ score"čæ™ē§čÆ­ä¹‰ę—¶ļ¼Œåŗ”ęŠŠę¬”č¦ stopper ēš„é˜ˆå€¼ę”¾å®½åˆ°ę°øčæœäøä¼šå…ˆč§¦å‘ć€‚ + +**Qļ¼šå•č½®å·²ē»č¶…čæ‡ timeout äŗ†ę€Žä¹ˆåŠžļ¼Ÿ** +Aļ¼šä»ä¼šē­‰å½“å‰č½®č·‘å®Œę‰åœę­¢ć€‚č‹„čÆ„č½®č·‘å¾—å¤Ŗä¹…ļ¼ˆå¦‚ LLM å”ä½ļ¼‰ļ¼ŒåÆåœØ `call_agent` å†…éƒØåÆ¹ LLM č°ƒē”ØåŠ č¶…ę—¶ļ¼ˆč§ `blackbox_cli/agent/call_agent.py` ēš„ `CLI_TIMEOUT_SEC` ęØ”å¼ļ¼‰ć€‚ + +**Qļ¼šäøšåŠ”é‡ŒåŖå…³åæƒ timeoutļ¼Œå…¶ä»–äøé™ę€Žä¹ˆé…ļ¼Ÿ** +Aļ¼šä»…č®¾ `timeout_seconds=<ē§’ę•°>`ļ¼Œå…¶ä½™ 5 äøŖå­—ę®µäøå†™å³åÆļ¼ˆé»˜č®¤ē¦ē”Øļ¼‰ć€‚ä½†éœ€ę³Øę„č‡³å°‘é… 1 äøŖ stopper怂 + +## 8 Ā· å®žéŖŒå»ŗč®®ļ¼šč®©å…¶ä»– stopper ęŠ¢é—ø + +é€ščæ‡č°ƒę•“é˜ˆå€¼ē»„åˆåÆä»„č§‚åÆŸäøåŒ stopper ēš„ēœŸå®žč”Œäøŗć€‚åÆä½œäøŗäøšåŠ”č°ƒå‚å‚č€ƒļ¼š + +| ęƒ³ēœ‹å“Ŗę” stopper ęŠ¢é—ø | é˜ˆå€¼č°ƒę•“ę–¹å‘ | +| --- | --- | +| `score_threshold` | 把 baseline 
写得"ēŗ¦ęŸę›“ē“§"让 GEPA å®¹ę˜“č¾¾ 1.0ļ¼›ęˆ–ęŠŠ score_threshold 调到 0.7 | +| `max_metric_calls` | 把 timeout_seconds č°ƒé«˜ļ¼ˆå¦‚ 600)+ minibatch č°ƒå°č®©čÆ„ä¼°é€Ÿåŗ¦åæ« | +| `max_iterations_without_improvement` | timeout_seconds 调高 + 任劔设讔成"éš¾ä»„å†ęå‡"ēš„č¾¹ē•Œåœŗę™Æ | +| `max_candidate_proposals` | 调到 2态timeout=300 | +| `max_tracked_candidates` | 调到 2态timeout=300态`frontier_type="hybrid"`ļ¼ˆå¤šå€™é€‰å¹¶å­˜ļ¼‰ | + +äøšåŠ”ēœŸå®žęŽ„å…„ę­„éŖ¤ļ¼š + +1. ęµ‹é‡å…øåž‹äøšåŠ”č“Ÿč½½äø‹å•č½®č€—ę—¶äøŽå•č½® metric_calls ꕰ +2. ꌉ SLO åęŽØęÆäøŖ stopper ēš„åˆē†é˜ˆå€¼ļ¼ˆå¦‚ CI 5min → timeout=180s ē•™ 60s 缓冲) +3. č·‘äø€ę¬”åŸŗå‡†å®žéŖŒč§‚åÆŸ `stop_reason` ę˜Æå¦å¦‚ęœŸ +4. ę ¹ę®å®žé™…č”Œäøŗå¾®č°ƒé˜ˆå€¼ + +> äøšåŠ”ēœŸå®žęŽ„å…„ę—¶äøč¦å¤åˆ¶ęœ¬ example ēš„ 6 äøŖå€¼ā€”ā€”ęœ¬ example ēš„é˜ˆå€¼ę˜Æäøŗ"ę¼”ē¤ŗę•ˆęžœåÆč§"č€Œč®¾ļ¼Œå®žé™…äøšåŠ”åŗ”ę ¹ę® LLM åŽē«Æé€Ÿåŗ¦ć€ę•°ę®é›†č§„ęØ”ć€SLO ēŖ—å£åęŽØć€‚ diff --git a/examples/optimization/slo_runtime_control/agent/__init__.py b/examples/optimization/slo_runtime_control/agent/__init__.py new file mode 100644 index 0000000..c3f6407 --- /dev/null +++ b/examples/optimization/slo_runtime_control/agent/__init__.py @@ -0,0 +1,10 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""SLO runtime control demo agent — å®¢ęˆ·å·„å•åˆ†ē±»ć€‚""" + +from .agent import SYSTEM_PROMPT_PATH, create_agent + +__all__ = ["SYSTEM_PROMPT_PATH", "create_agent"] diff --git a/examples/optimization/slo_runtime_control/agent/agent.py b/examples/optimization/slo_runtime_control/agent/agent.py new file mode 100644 index 0000000..6a93096 --- /dev/null +++ b/examples/optimization/slo_runtime_control/agent/agent.py @@ -0,0 +1,47 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. 
def _create_model() -> LLMModel:
    """Build the OpenAI-compatible chat model from the configured credentials."""
    credentials = get_model_config()
    api_key, base_url, model_name = credentials
    return OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url)


def _read_instruction() -> str:
    """Read ``system.md`` from disk and return it without surrounding whitespace."""
    raw_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8")
    return raw_prompt.strip()


def create_agent() -> LlmAgent:
    """Build a fresh ``LlmAgent`` that uses the prompt currently on disk.

    The prompt file is re-read on every call, so a prompt written to
    ``SYSTEM_PROMPT_PATH`` between calls is picked up by the next agent.
    """
    # Model first, then instruction, then generation config: the same
    # evaluation order as the original single-expression constructor call.
    model = _create_model()
    instruction = _read_instruction()
    generation_config = GenerateContentConfig(
        temperature=0.2,
        top_p=0.9,
        max_output_tokens=512,
    )
    return LlmAgent(
        name="ticket_classifier_agent",
        description="A customer-service ticket classifier under multi-stop SLO control.",
        model=model,
        instruction=instruction,
        generate_content_config=generation_config,
    )
def get_model_config() -> tuple[str, str, str]:
    """Return ``(api_key, base_url, model_name)`` read from the environment.

    Reads ``TRPC_AGENT_API_KEY``, ``TRPC_AGENT_BASE_URL`` and
    ``TRPC_AGENT_MODEL_NAME``.

    Raises:
        ValueError: if any of the three variables is unset or empty.
    """
    settings = (
        os.getenv("TRPC_AGENT_API_KEY", ""),
        os.getenv("TRPC_AGENT_BASE_URL", ""),
        os.getenv("TRPC_AGENT_MODEL_NAME", ""),
    )
    # Fail fast: an unset variable and an empty one are treated the same.
    if all(settings):
        return settings
    raise ValueError(
        "čæč”Œä¼˜åŒ–å™Øå‰åæ…é”»é…ē½®ēŽÆå¢ƒå˜é‡ TRPC_AGENT_API_KEY / "
        "TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME怂"
    )
"reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": { "max_tokens": 4096, "temperature": 0.6 } + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "frontier_type": "instance", + "reflection_minibatch_size": 3, + "reflection_history_top_k": 2, + "skip_perfect_score": false, + "use_merge": false, + "max_metric_calls": 30, + "max_iterations_without_improvement": 3, + "timeout_seconds": 90.0, + "score_threshold": 1.0, + "max_candidate_proposals": 12, + "max_tracked_candidates": 5 + } + } +} diff --git a/examples/optimization/slo_runtime_control/run_optimization.py b/examples/optimization/slo_runtime_control/run_optimization.py new file mode 100644 index 0000000..0ef65d0 --- /dev/null +++ b/examples/optimization/slo_runtime_control/run_optimization.py @@ -0,0 +1,143 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""SLO Runtime Control example ēš„ä¼˜åŒ–å™Øå…„å£ć€‚ + +é€‚ē”Øåœŗę™Æ +-------- +在 CI 流氓线 / å¤œé—“ēŖ—å£ē­‰å…·ęœ‰ē”¬ę€§ę—¶é—“å’Œčµ„ęŗēŗ¦ęŸēš„ēŽÆå¢ƒäø‹čæč”Œ prompt ä¼˜åŒ–ļ¼Œ +éœ€č¦"任何一个 SLO č§¦å‘éƒ½ē«‹åˆ»åœ"ēš„å¤šé‡åœę­¢ē­–ē•„ć€‚ęœ¬č„šęœ¬ę¼”ē¤ŗåŒę—¶åÆē”Ø SDK +ęä¾›ēš„ 6 ē§ algorithm-level stop conditions,OR čÆ­ä¹‰ęŠ¢é—øć€‚ + +čæ™äøŖę–‡ä»¶åšä»€ä¹ˆ +-------------- +1. ę³Øå†Œå•å­—ę®µ TargetPrompt(agent/prompts/system.md) +2. 定义 call_agentļ¼šē”Ø _normalize_response 把 LLM č¾“å‡ŗč§„čŒƒåŒ–äøŗēØ³å®š JSON + å­—ē¬¦äø²ļ¼Œä½æ final_response_avg_score čµ° text exact č€Œéžä¾čµ– LLM judge +3. 
调 AgentOptimizer.optimizeļ¼›6 ē§ stop condition é˜ˆå€¼åœØ optimizer.json äø­ + +ę€Žä¹ˆč·‘ +------ +1) 配 TRPC_AGENT_API_KEY / TRPC_AGENT_BASE_URL / TRPC_AGENT_MODEL_NAME +2) python examples/optimization/slo_runtime_control/run_optimization.py +3) ēœ‹ runs/<ę—¶é—“ęˆ³>/result.json äø­ēš„ stop_reason å­—ę®µļ¼ŒčÆ†åˆ«å“Ŗę” SLO ęŠ¢é—ø + +ęŽ„å…„č‡Ŗęœ‰äøšåŠ”ę—¶ę”¹å“Ŗé‡Œ +-------------------- +- optimizer.json äø­ 6 äøŖ stop condition é˜ˆå€¼ęŒ‰äøšåŠ” SLO åęŽØ + (详见 README §5 äøŽ §8) +- agent/agent.py ę”¹äøŗäøšåŠ” agent +- _normalize_response ęŒ‰äøšåŠ”č¾“å‡ŗę ¼å¼č°ƒę•“ļ¼ˆäøšåŠ”éž JSON č¾“å‡ŗåÆę•“ä½“ę›æę¢ļ¼‰ +""" + +from __future__ import annotations + +import asyncio +import json +import re +import sys +import uuid +from datetime import datetime +from pathlib import Path + +_HERE = Path(__file__).resolve().parent +_REPO_ROOT = _HERE.parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt # noqa: E402 +from trpc_agent_sdk.runners import Runner # noqa: E402 +from trpc_agent_sdk.sessions import InMemorySessionService # noqa: E402 +from trpc_agent_sdk.types import Content, Part # noqa: E402 + +from agent.agent import SYSTEM_PROMPT_PATH, create_agent # noqa: E402 + + +CONFIG_PATH = _HERE / "optimizer.json" +TRAIN_PATH = _HERE / "train.evalset.json" +VAL_PATH = _HERE / "val.evalset.json" +RUNS_DIR = _HERE / "runs" +APP_NAME = "slo_runtime_control_agent" + + +_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL) + + +def _normalize_response(raw: str) -> str: + """把 LLM č‡Ŗē”±ę–‡ęœ¬č§„čŒƒåŒ–ęˆäøŽ reference å®Œå…Øäø€č‡“ēš„å­—ē¬¦äø²å½¢ę€ć€‚ + + äøŽ blackbox_cli / ci_integration å®Œå…Øē›øåŒēš„č§„čŒƒåŒ–é€»č¾‘ļ¼šč®© + final_response_avg_score(text.match=exact) ē›“ęŽ„čµ°ē²¾ē”®åŒ¹é…ļ¼Œ + éæå… LLM judge å¼•å…„é¢å¤–äøē”®å®šę€§äøŽę—¶é—“å¼€é”€ā€”ā€”čæ™åÆ¹čæč”Œę—¶ SLO + ęŽ§åˆ¶åœŗę™Æč‡³å…³é‡č¦ć€‚ + """ + text = (raw or 
"").strip() + if not text: + return "" + match = _JSON_OBJECT_RE.search(text) + if not match: + return text + try: + parsed = json.loads(match.group(0)) + except json.JSONDecodeError: + return text + return json.dumps(parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":")) + + +async def call_agent(query: str) -> str: + """ę”†ęž¶å›žč°ƒļ¼šč·‘äø€ę¬”ęŽØē†ļ¼Œč¾“å‡ŗē» _normalize_response č§„čŒƒåŒ–ć€‚ + + ęÆäøŖ case äø€ä»½ē‹¬ē«‹ēš„ Runner + InMemorySessionServiceļ¼ŒäæčÆå¹¶å‘čÆ„ęµ‹ę—¶ + session state äøäŗ’ē›øę±”ęŸ“ć€‚ + """ + root_agent = create_agent() + session_service = InMemorySessionService() + runner = Runner(app_name=APP_NAME, agent=root_agent, session_service=session_service) + session_id = str(uuid.uuid4()) + user_id = "optimizer" + await session_service.create_session( + app_name=APP_NAME, user_id=user_id, session_id=session_id, state={} + ) + user_content = Content(role="user", parts=[Part.from_text(text=query)]) + + final_text = "" + async for event in runner.run_async( + user_id=user_id, session_id=session_id, new_message=user_content + ): + if not event.is_final_response(): + continue + if not event.content or not event.content.parts: + continue + for part in event.content.parts: + if part.thought: + continue + if part.text: + final_text += part.text + return _normalize_response(final_text) + + +async def main() -> None: + """组装 TargetPrompt + 调 AgentOptimizer.optimize怂""" + target = TargetPrompt().add_path("system_prompt", str(SYSTEM_PROMPT_PATH)) + + timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + output_dir = RUNS_DIR / timestamp + + await AgentOptimizer.optimize( + config_path=str(CONFIG_PATH), + call_agent=call_agent, + target_prompt=target, + train_dataset_path=str(TRAIN_PATH), + validation_dataset_path=str(VAL_PATH), + output_dir=str(output_dir), + update_source=False, + verbose=1, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/optimization/slo_runtime_control/train.evalset.json 
b/examples/optimization/slo_runtime_control/train.evalset.json new file mode 100644 index 0000000..ee85394 --- /dev/null +++ b/examples/optimization/slo_runtime_control/train.evalset.json @@ -0,0 +1,239 @@ +{ + "eval_set_id": "slo_runtime_train", + "name": "SLO runtime control demo - train", + "description": "8 å®¢ęœå·„å•ć€‚category in {account,billing,technical,feedback}; priority in {low,normal,high}. final_response å·²č§„čŒƒåŒ–äøŗē“§å‡‘ JSON äøŽ _normalize_response č¾“å‡ŗę ¼å¼äø€č‡“ļ¼Œē”Ø final_response_avg_score(text.match=exact) åšē”¬ęÆ”åÆ¹ć€‚ę··å…„ t5/t6/t8 äø‰é“č¾¹ē•Œę··ę·†é¢˜č®©ęØ”åž‹äøę˜“äø€ę¬”åˆ° 1.0ļ¼Œčæ«ä½æå…¶ä»– stopper(timeout/iterations/proposalsļ¼‰ęœ‰ęœŗä¼šå…ˆęŠ¢é—øć€‚", + "eval_cases": [ + { + "eval_id": "ticket_t1", + "conversation": [ + { + "invocation_id": "t1", + "user_content": { + "parts": [ + { + "text": "ęˆ‘åæ˜č®°äŗ†ē™»å½•åÆ†ē ļ¼Œę€Žä¹ˆę‰¾å›žļ¼Ÿ" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"account\",\"priority\":\"normal\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t2", + "conversation": [ + { + "invocation_id": "t2", + "user_content": { + "parts": [ + { + "text": "äøŠäøŖęœˆč¢«å¤šę‰£äŗ† 99 å…ƒčÆč“¹ļ¼ŒčÆ·é€€ę¬¾ć€‚" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"billing\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t3", + "conversation": [ + { + "invocation_id": "t3", + "user_content": { + "parts": [ + { + "text": "App åÆåŠØåŽē«‹åˆ»é—Ŗé€€ļ¼Œå·²é‡č£…äø¤ę¬”éƒ½äøč”Œć€‚" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"technical\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + 
"session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t4", + "conversation": [ + { + "invocation_id": "t4", + "user_content": { + "parts": [ + { + "text": "åøŒęœ›å¢žåŠ ę·±č‰²ęØ”å¼ļ¼Œēœ¼ē›ēœ‹äŗ®å±å¤Ŗē“Æć€‚" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"feedback\",\"priority\":\"low\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t5", + "conversation": [ + { + "invocation_id": "t5", + "user_content": { + "parts": [ + { + "text": "ęˆ‘č“¦å·č¢«ē›—äŗ†ļ¼Œē™»å½•ęē¤ŗč®¾å¤‡å¼‚åøøčÆ·ę±‚éŖŒčÆē ä¹Ÿę”¶äøåˆ°ć€‚" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"account\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t6", + "conversation": [ + { + "invocation_id": "t6", + "user_content": { + "parts": [ + { + "text": "ē»­č“¹ęŒ‰é’®ē‚¹äŗ†ę²”ååŗ”ļ¼Œä½†é“¶č”ŒēŸ­äæ”ę˜¾ē¤ŗå·²ę‰£ę¬¾ć€‚" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"billing\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + "eval_id": "ticket_t7", + "conversation": [ + { + "invocation_id": "t7", + "user_content": { + "parts": [ + { + "text": "ä½ ä»¬ēš„å®¢ęœę€åŗ¦å¾ˆå„½ļ¼Œē»™äøŖčµžć€‚" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"feedback\",\"priority\":\"low\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + }, + { + 
"eval_id": "ticket_t8", + "conversation": [ + { + "invocation_id": "t8", + "user_content": { + "parts": [ + { + "text": "ę‰£č“¹å¤±č“„ä½†å®¢ęˆ·čÆ“ęƒ³å–ę¶ˆč“¦ęˆ·ļ¼Œäø‹äøŖęœˆčæ˜č¦ē»§ē»­å—ļ¼Ÿ" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"account\",\"priority\":\"normal\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "trainer", + "state": {} + } + } + ] +} \ No newline at end of file diff --git a/examples/optimization/slo_runtime_control/val.evalset.json b/examples/optimization/slo_runtime_control/val.evalset.json new file mode 100644 index 0000000..408b3af --- /dev/null +++ b/examples/optimization/slo_runtime_control/val.evalset.json @@ -0,0 +1,123 @@ +{ + "eval_set_id": "slo_runtime_val", + "name": "SLO runtime control demo - val", + "description": "4 é“ē•™å‡ŗéŖŒčÆ case,含 1 é“č¾¹ē•Œę··ę·†ļ¼ˆv1 billing vs technical)。", + "eval_cases": [ + { + "eval_id": "ticket_v1", + "conversation": [ + { + "invocation_id": "v1", + "user_content": { + "parts": [ + { + "text": "äæ”ē”Øå”ę”Æä»˜äø€ē›“č½¬åœˆęœ€åŽå¤±č“„ļ¼Œč“¦å•é”µé¢ä¹Ÿę‰“äøå¼€ć€‚" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"technical\",\"priority\":\"high\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "ticket_v2", + "conversation": [ + { + "invocation_id": "v2", + "user_content": { + "parts": [ + { + "text": "åøŒęœ›ę”ÆęŒå¾®äæ”å’Œę”Æä»˜å®åŒę—¶ē»‘å®šć€‚" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"feedback\",\"priority\":\"low\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "ticket_v3", + "conversation": [ + { + "invocation_id": "v3", + 
"user_content": { + "parts": [ + { + "text": "å¦‚ä½•äæ®ę”¹ęˆ‘ēš„ę³Øå†Œę‰‹ęœŗå·ļ¼Ÿ" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"account\",\"priority\":\"normal\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "validator", + "state": {} + } + }, + { + "eval_id": "ticket_v4", + "conversation": [ + { + "invocation_id": "v4", + "user_content": { + "parts": [ + { + "text": "12 ęœˆä»½å‘ē„Øę€Žä¹ˆå¼€ļ¼Ÿ" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "{\"category\":\"billing\",\"priority\":\"low\"}" + } + ], + "role": "model" + } + } + ], + "session_input": { + "app_name": "slo_runtime_control_agent", + "user_id": "validator", + "state": {} + } + } + ] +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c4e0e85..6c47d4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,11 @@ eval = [ "tabulate", ] +optimize = [ + "gepa>=0.0.7", + "rich>=13.0.0", +] + mem0 = [ "mem0ai>=1.0.3", "sentence-transformers", @@ -151,6 +156,8 @@ all = [ "wecom-aibot-sdk-python>=0.1.5", "a2a-sdk<1.0.0,>=0.3.22", "e2b-code-interpreter>=2.0.0", + "gepa>=0.0.7", + "rich>=13.0.0", ] [project.scripts] diff --git a/requirements-test.txt b/requirements-test.txt index 123052a..c54dad0 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -10,6 +10,8 @@ unittest-xml-reporting rouge-score pandas tabulate +gepa>=0.0.7 +rich>=13.0.0 # Test DB greenlet diff --git a/requirements.txt b/requirements.txt index 4360f19..5e7264d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -50,3 +50,5 @@ charset-normalizer>=3.0.0 litellm>=1.75.5 mempalace==3.3.4 json-repair>=0.40.0 +gepa>=0.0.7 +rich>=13.0.0 diff --git a/tests/evaluation/test_agent_optimizer.py b/tests/evaluation/test_agent_optimizer.py new file mode 100644 index 0000000..d7b061f --- /dev/null +++ b/tests/evaluation/test_agent_optimizer.py @@ -0,0 
+1,1285 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for AgentOptimizer facade dispatch.""" + +from __future__ import annotations + +from typing import Optional + +import pytest + +from trpc_agent_sdk.evaluation._agent_optimizer import AgentOptimizer +from trpc_agent_sdk.evaluation._eval_case import EvalCase +from trpc_agent_sdk.evaluation._eval_case import Invocation +from trpc_agent_sdk.evaluation._eval_set import EvalSet +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import GepaReflectiveOptimizer +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +def _invocation(user_text: str, response_text: Optional[str] = None) -> Invocation: + final_response = ( + Content(role="model", parts=[Part.from_text(text=response_text)]) + if response_text is not None + else None + ) + return Invocation( + user_content=Content(role="user", parts=[Part.from_text(text=user_text)]), + final_response=final_response, + ) + + +def _eval_case(eval_id: str = "c1") -> EvalCase: + return EvalCase(eval_id=eval_id, conversation=[_invocation("hi", "ack")]) + + +async def _stub_call_agent(query: str) -> str: + return "stub" + + +def _new_target_prompt(recorder: Optional[dict[str, str]] = None) -> TargetPrompt: + target = TargetPrompt() + state = recorder if recorder is not None else {} + + async def read_cb() -> str: + return state.get("instruction", "initial") + + async def write_cb(value: str) -> None: + state["instruction"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + return target + + +class _FakeGEPAResult: + def __init__(self, candidates, val_scores): + self.candidates = candidates + self.val_aggregate_scores = val_scores + self.parents = [[None]] + [[i - 1] for i in 
range(1, len(candidates))] + self.discovery_eval_counts = [0] * len(candidates) + self.total_metric_calls = 0 + self.best_outputs_valset = None + + @property + def best_idx(self) -> int: + return max( + range(len(self.val_aggregate_scores)), + key=lambda i: self.val_aggregate_scores[i], + ) + + +def _write_config_file( + tmp_path, + algo_name: str = "gepa_reflective", + *, + extra_algo: Optional[dict] = None, +) -> str: + """Write a valid optimizer.json file to tmp_path and return its path. + + ``extra_algo`` is merged into the algorithm block to override or add + optional fields (e.g. ``use_merge``). + """ + import json + algo_block = { + "name": algo_name, + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 30, + } + if extra_algo: + algo_block.update(extra_algo) + payload = { + "evaluate": { + "metrics": [{"metric_name": "m1", "threshold": 0.7}], + "num_runs": 1, + }, + "optimize": { + "algorithm": algo_block, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + return str(config_path) + + +@pytest.mark.asyncio +async def test_facade_reads_config_file_and_dispatches(tmp_path, monkeypatch): + """End-to-end: AgentOptimizer.optimize(config_path=...) 
reads the file + dispatches.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + recorder: dict[str, str] = {} + target = _new_target_prompt(recorder) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "test1"), + update_source=True, + verbose=0, + ) + + assert result.status == "SUCCEEDED" + assert result.best_pass_rate == pytest.approx(0.9) + assert result.best_prompts == {"instruction": "improved"} + assert recorder["instruction"] == "improved" + + +@pytest.mark.asyncio +async def test_facade_unknown_algorithm_raises_valueerror(tmp_path): + """If config.optimize.algorithm.name is not registered, raise ValueError listing options.""" + import json + payload = { + "evaluate": {"metrics": [{"metric_name": "m1", "threshold": 0.7}], "num_runs": 1}, + "optimize": { + "algorithm": { + "name": "no_such_algorithm", + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 30, + }, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + 
config_path=str(config_path), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "test_unknown"), + verbose=0, + ) + + msg = str(exc_info.value) + assert "no_such_algorithm" in msg + + +@pytest.mark.asyncio +async def test_facade_unknown_algorithm_lists_available_algorithms(tmp_path): + """API-A1: error message must enumerate registered algorithms so the user + can see what they should have written instead. Previously pydantic's + literal_error fired first and produced 'Input should be ...' without + listing alternatives.""" + import json + payload = { + "evaluate": {"metrics": [{"metric_name": "m1", "threshold": 0.7}], "num_runs": 1}, + "optimize": { + "algorithm": { + "name": "gepa_reflactive", # typo of gepa_reflective + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 30, + }, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=str(config_path), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "typo_check"), + verbose=0, + ) + + msg = str(exc_info.value) + # Friendly enumeration: must include both the typo and at least one + # registered algorithm so users see what to type. 
+ assert "gepa_reflactive" in msg + assert "Available algorithms" in msg + assert "gepa_reflective" in msg + + +@pytest.mark.asyncio +async def test_facade_missing_config_file_raises(tmp_path): + """If config_path does not exist, propagate FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + await AgentOptimizer.optimize( + config_path=str(tmp_path / "nonexistent.json"), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "missing"), + verbose=0, + ) + + +def test_facade_is_exported_from_evaluation_package(): + import trpc_agent_sdk.evaluation as ev + assert ev.AgentOptimizer is AgentOptimizer + + +@pytest.mark.asyncio +async def test_facade_persists_artifacts_under_output_dir(tmp_path, monkeypatch): + """The facade must materialise result.json, summary.txt, rounds/*.json, + baseline_prompts/, best_prompts/, config.snapshot.json and run.log under + output_dir for every successful run.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + output_dir = tmp_path / "runs" / "artifact_check" + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + 
validation_dataset_path=str(val_path), + output_dir=str(output_dir), + verbose=0, + ) + + assert (output_dir / "result.json").is_file() + assert (output_dir / "summary.txt").is_file() + assert (output_dir / "config.snapshot.json").is_file() + assert (output_dir / "run.log").is_file() + assert (output_dir / "baseline_prompts" / "instruction.md").is_file() + assert (output_dir / "best_prompts" / "instruction.md").is_file() + best_text = (output_dir / "best_prompts" / "instruction.md").read_text(encoding="utf-8") + assert best_text == "improved" + log_line = (output_dir / "run.log").read_text(encoding="utf-8") + assert "SUCCEEDED" in log_line + + +@pytest.mark.asyncio +async def test_facade_persists_artifacts_when_algorithm_fails(tmp_path, monkeypatch): + """Even when the algorithm returns a FAILED result the facade should + still leave baseline_prompts, config snapshot and run.log on disk so + debug context is preserved across runs.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + async def boom(self, **kwargs): + raise RuntimeError("evaluator timeout") + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", boom) + + output_dir = tmp_path / "runs" / "failure_check" + result = await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(output_dir), + verbose=0, + ) + assert result.status == "FAILED" + assert "evaluator timeout" in result.error_message + assert (output_dir / "result.json").is_file() + assert (output_dir / 
"baseline_prompts" / "instruction.md").is_file() + assert (output_dir / "config.snapshot.json").is_file() + assert (output_dir / "run.log").is_file() + + +@pytest.mark.asyncio +async def test_facade_verbose_zero_emits_no_terminal_output( + tmp_path, monkeypatch, capsys +): + """verbose=0 must suppress every reporter event so the user can run the + optimizer inside batch pipelines without polluting downstream stdout.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "silent"), + verbose=0, + ) + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + +# ----- A3-A6: optimizer startup-time input validation (spec §3.2) ----- + + +@pytest.mark.asyncio +async def test_facade_rejects_tool_trajectory_avg_score_metric(tmp_path): + """spec §3.2 / acceptance #12: tool_trajectory_avg_score requires session traces + so it is unusable in call_agent mode; reject at startup.""" + import json + payload = { + "evaluate": { + "metrics": [{"metric_name": "tool_trajectory_avg_score", "threshold": 0.8}], + "num_runs": 1, + }, + "optimize": { + "algorithm": { + 
"name": "gepa_reflective", + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 10, + }, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=str(config_path), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "metric_check"), + verbose=0, + ) + assert "tool_trajectory_avg_score" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_facade_rejects_llm_rubric_knowledge_recall_metric(tmp_path): + """F-4: ``llm_rubric_knowledge_recall`` reads tool responses from + ``Invocation.intermediate_data``; ``RemoteEvalService`` always emits + ``intermediate_data=None`` so the judge would silently see "No + knowledge search results were found." for every case. Reject at + startup so users do not waste an optimization run on a metric that + can never produce a non-zero score in call_agent mode. 
+ """ + import json + payload = { + "evaluate": { + "metrics": [{"metric_name": "llm_rubric_knowledge_recall", "threshold": 0.8}], + "num_runs": 1, + }, + "optimize": { + "algorithm": { + "name": "gepa_reflective", + "reflection_lm": { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + }, + "max_metric_calls": 10, + }, + }, + } + config_path = tmp_path / "optimizer.json" + config_path.write_text(json.dumps(payload), encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=str(config_path), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "metric_check_recall"), + verbose=0, + ) + assert "llm_rubric_knowledge_recall" in str(exc_info.value) + # Error message should hint at compatible alternatives so users can + # immediately switch instead of guessing. + assert "final_response_avg_score" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_facade_rejects_empty_target_prompt(tmp_path): + """spec §3.2: TargetPrompt with no registered fields is a usage error.""" + config_path = _write_config_file(tmp_path) + empty_target = TargetPrompt() + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=empty_target, + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "empty_target"), + verbose=0, + ) + assert "TargetPrompt" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_facade_rejects_non_async_call_agent(tmp_path): + """spec §3.2: call_agent must be async; reject sync functions at startup.""" + config_path = _write_config_file(tmp_path) + + def sync_call_agent(query: str) -> str: + return "stub" + + with pytest.raises(TypeError) as exc_info: + await AgentOptimizer.optimize( + 
config_path=config_path, + call_agent=sync_call_agent, # type: ignore[arg-type] + target_prompt=_new_target_prompt(), + train_dataset_path="/tmp/x.json", + validation_dataset_path="/tmp/y.json", + output_dir=str(tmp_path / "runs" / "sync_check"), + verbose=0, + ) + assert "async" in str(exc_info.value).lower() + + +@pytest.mark.asyncio +async def test_facade_rejects_same_train_and_validation_paths(tmp_path): + """spec §3.2: train and validation paths must be different to avoid train-test leakage.""" + config_path = _write_config_file(tmp_path) + same_path = tmp_path / "shared.evalset.json" + same_path.write_text("{}", encoding="utf-8") + + with pytest.raises(ValueError) as exc_info: + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(same_path), + validation_dataset_path=str(same_path), + output_dir=str(tmp_path / "runs" / "leakage_check"), + verbose=0, + ) + assert "train" in str(exc_info.value).lower() or "leak" in str(exc_info.value).lower() + + +@pytest.mark.asyncio +async def test_facade_warns_when_use_merge_with_single_field(tmp_path): + """GEPA-3: gepa merge degenerates to picking one of two parents when only + a single component is registered. 
Surface a UserWarning so users don't + silently see merge_rounds_total=0.""" + from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config + + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text("{}", encoding="utf-8") + val_path.write_text("{}", encoding="utf-8") + + config_path = _write_config_file(tmp_path, extra_algo={"use_merge": True}) + config = load_optimize_config(config_path) + + with pytest.warns(UserWarning, match="use_merge=true"): + AgentOptimizer._validate_inputs( + config=config, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), # single callback field + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "merge_warn"), + ) + + +@pytest.mark.asyncio +async def test_facade_no_warn_when_use_merge_with_two_fields(tmp_path): + """Multi-field config + use_merge=True: warning must NOT fire.""" + import warnings as _warnings + from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config + + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text("{}", encoding="utf-8") + val_path.write_text("{}", encoding="utf-8") + + config_path = _write_config_file(tmp_path, extra_algo={"use_merge": True}) + config = load_optimize_config(config_path) + + target = TargetPrompt() + state_a: dict[str, str] = {} + state_b: dict[str, str] = {} + + async def read_a() -> str: + return state_a.get("v", "") + + async def write_a(v: str) -> None: + state_a["v"] = v + + async def read_b() -> str: + return state_b.get("v", "") + + async def write_b(v: str) -> None: + state_b["v"] = v + + target.add_callback("a", read=read_a, write=write_a) + target.add_callback("b", read=read_b, write=write_b) + + with _warnings.catch_warnings(): + _warnings.simplefilter("error", UserWarning) # any UserWarning fails the test + AgentOptimizer._validate_inputs( + config=config, + call_agent=_stub_call_agent, 
+ target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "merge_two_fields"), + ) + + +@pytest.mark.asyncio +async def test_facade_no_warn_when_use_merge_false_single_field(tmp_path): + """use_merge=false (default) + single field: warning must NOT fire.""" + import warnings as _warnings + from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config + + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text("{}", encoding="utf-8") + val_path.write_text("{}", encoding="utf-8") + + config_path = _write_config_file(tmp_path) # default use_merge=False + config = load_optimize_config(config_path) + + with _warnings.catch_warnings(): + _warnings.simplefilter("error", UserWarning) + AgentOptimizer._validate_inputs( + config=config, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "no_merge"), + ) + + +@pytest.mark.asyncio +async def test_facade_restores_baseline_when_writeback_fails(tmp_path, monkeypatch): + """If update_source=True but writing the best candidate back fails, sources + must end up at the original baseline (not mid-run candidate) and the + write-back exception must surface to the caller.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + recorder: dict[str, str] = {"instruction": "BASELINE"} + target = _new_target_prompt(recorder) + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "MID_CANDIDATE"}], + 
val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + # Simulate gepa rewriting the source during a round. + recorder["instruction"] = "MID_CANDIDATE" + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + # Patch write_all to fail only when the best is about to be persisted. + original_write_all = target.write_all + call_count = {"n": 0} + + async def explosive_write_all(prompts): + call_count["n"] += 1 + if prompts.get("instruction") == "IMPROVED_BEST": + raise RuntimeError("disk full") + await original_write_all(prompts) + + # Make optimizer.run() set best_prompts to a distinct value the test can + # detect; rebuild fake gepa result. + fake_gepa_result.candidates = [{"instruction": "BASELINE"}, {"instruction": "IMPROVED_BEST"}] + target.write_all = explosive_write_all # type: ignore[assignment] + + with pytest.raises(RuntimeError, match="disk full"): + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "writeback_fail"), + update_source=True, + verbose=0, + ) + + assert recorder["instruction"] == "BASELINE", ( + "after a failed write-back the source must be restored to baseline, " + f"got {recorder['instruction']!r}" + ) + + +@pytest.mark.asyncio +async def test_facade_default_update_source_false_keeps_source_intact(tmp_path, monkeypatch): + """A2: default ``update_source=False`` MUST leave TargetPrompt source untouched.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + 
recorder: dict[str, str] = {"instruction": "INITIAL"} + target = _new_target_prompt(recorder) + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "INITIAL"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "default_keep"), + verbose=0, + ) + + assert result.best_prompts == {"instruction": "IMPROVED"} + assert recorder["instruction"] == "INITIAL", ( + "default update_source=False MUST NOT write the best candidate back to source" + ) + + +@pytest.mark.asyncio +async def test_facade_update_source_true_writes_best_back(tmp_path, monkeypatch): + """A2: explicit ``update_source=True`` writes the best candidate back to TargetPrompt.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + recorder: dict[str, str] = {"instruction": "INITIAL"} + target = _new_target_prompt(recorder) + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "INITIAL"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + 
train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "update_true"), + update_source=True, + verbose=0, + ) + + assert result.best_prompts == {"instruction": "IMPROVED"} + assert recorder["instruction"] == "IMPROVED" + + +@pytest.mark.asyncio +async def test_facade_accepts_train_and_validation_paths_differing_only_by_dot_slash(tmp_path): + """Resolve symlinks/relative prefixes so './x' and 'x' are detected as same file.""" + config_path = _write_config_file(tmp_path) + same_path = tmp_path / "shared.evalset.json" + same_path.write_text("{}", encoding="utf-8") + train_str = f"{same_path.parent}/./{same_path.name}" + + with pytest.raises(ValueError): + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=train_str, + validation_dataset_path=str(same_path), + output_dir=str(tmp_path / "runs" / "dotslash_check"), + verbose=0, + ) + + +@pytest.mark.asyncio +async def test_facade_forwards_extra_stop_and_gepa_callbacks(tmp_path, monkeypatch): + """AgentOptimizer.optimize must forward extra_stop/gepa_callbacks to the algorithm.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + def sentinel_stopper(gepa_state=None): + return False + + sentinel_callback = object() + captured: dict = {} + + async def _capture_run(self, *, reporter=None): + from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult + + captured["extra_stop"] = list(self.extra_stop_callbacks) + captured["extra_gepa"] = list(self.extra_gepa_callbacks) + return OptimizeResult( + 
algorithm="gepa_reflective", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + baseline_prompts={"instruction": "initial"}, + best_prompts={"instruction": "initial"}, + total_rounds=0, + rounds=[], + total_reflection_lm_calls=0, + total_judge_model_calls=0, + total_llm_cost=0.0, + duration_seconds=0.0, + started_at="2026-05-18T00:00:00+00:00", + finished_at="2026-05-18T00:00:00+00:00", + extras={}, + ) + + monkeypatch.setattr(GepaReflectiveOptimizer, "run", _capture_run) + + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "extras"), + extra_stop_callbacks=[sentinel_stopper], + extra_gepa_callbacks=[sentinel_callback], + verbose=0, + ) + + assert sentinel_stopper in captured["extra_stop"] + assert sentinel_callback in captured["extra_gepa"] + + +@pytest.mark.asyncio +async def test_facade_summary_txt_reflects_update_source_true(tmp_path, monkeypatch): + """DOC-1: summary.txt must reflect the actual update_source value used. 
+ Previously _persist_artifacts hard-coded update_source=False so the file + contradicted the terminal banner whenever the user passed update_source=True.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + # Output dir intentionally lacks the substring "true" so the assertion + # below cannot accidentally match the path itself. + output_dir = tmp_path / "runs" / "us_check_a" + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(output_dir), + update_source=True, + verbose=0, + ) + + summary_text = (output_dir / "summary.txt").read_text(encoding="utf-8") + # format_summary writes the exact line "update_source : true" / "false". 
+ assert "update_source : true" in summary_text, ( + f"summary.txt should reflect update_source=True; got:\n{summary_text}" + ) + assert "update_source : false" not in summary_text + + +@pytest.mark.asyncio +async def test_facade_summary_txt_reflects_update_source_false(tmp_path, monkeypatch): + """Complement: when update_source=False (default), summary still reflects that.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + output_dir = tmp_path / "runs" / "us_check_b" + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(output_dir), + update_source=False, + verbose=0, + ) + + summary_text = (output_dir / "summary.txt").read_text(encoding="utf-8") + assert "update_source : false" in summary_text + assert "update_source : true" not in summary_text + + +# --- FAIL-2: cleanup_done sentinel prevents double baseline write_all --- + +@pytest.mark.asyncio +async def test_facade_failed_writeback_invokes_baseline_callback_exactly_once( + tmp_path, monkeypatch +): + """FAIL-2: when write_all(best) raises, ``cleanup_done`` must guarantee the + ``except`` rollback restore_baseline call is NOT followed by a second + restore in ``finally``. 
+ + Pre-fix code flipped ``writeback_succeeded`` only on the happy path, so + the failure path executed write_all(baseline) twice: once in ``except``, + once in ``finally``. Path-backed fields are idempotent (tmp + replace + is harmless), but callback-backed fields with non-idempotent + ``write_fn`` (audit logs, version counters) saw their hook fire twice + per failed update_source=True run. + """ + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + + # Spy on every write_fn call so we can count exactly how many times + # baseline is persisted after the best-write fails. + write_log: list[str] = [] + state: dict[str, str] = {"instruction": "BASELINE"} + + async def read_cb() -> str: + return state["instruction"] + + async def write_cb(value: str) -> None: + write_log.append(value) + if value == "IMPROVED_BEST": + raise RuntimeError("disk full while writing best candidate") + state["instruction"] = value + + target = TargetPrompt().add_callback( + "instruction", read=read_cb, write=write_cb + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "IMPROVED_BEST"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + with pytest.raises(RuntimeError, match="disk full"): + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "fail2_double_baseline"), + update_source=True, + verbose=0, 
+ ) + + # Expected sequence: best attempt (fails) -> baseline restore (success). + # Pre-fix would have appended a second "BASELINE" from the finally block. + assert write_log == ["IMPROVED_BEST", "BASELINE"], ( + "baseline write_fn must be invoked exactly once after a failed " + f"update_source=True writeback; got {write_log!r}" + ) + assert state["instruction"] == "BASELINE" + + +@pytest.mark.asyncio +async def test_facade_success_path_does_not_re_restore_baseline( + tmp_path, monkeypatch +): + """FAIL-2 happy-path counterpart: when write_all(best) succeeds, the + ``finally`` block must NOT re-write baseline either. + + Pre-fix code was also wrong here in a milder way: if ``writeback_succeeded`` + was False at finally entry the restore fired. The flag flipped on + success so the bug did not manifest on the happy path, but this test + pins the invariant explicitly so a future refactor cannot reintroduce + a double-write.""" + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + + write_log: list[str] = [] + state: dict[str, str] = {"instruction": "BASELINE"} + + async def read_cb() -> str: + return state["instruction"] + + async def write_cb(value: str) -> None: + write_log.append(value) + state["instruction"] = value + + target = TargetPrompt().add_callback( + "instruction", read=read_cb, write=write_cb + ) + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + await AgentOptimizer.optimize( + 
config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "fail2_happy"), + update_source=True, + verbose=0, + ) + + # Only one call: the successful best writeback. No baseline restore. + assert write_log == ["IMPROVED"], ( + "happy-path update_source=True must invoke write_fn exactly once " + f"(best); got {write_log!r}" + ) + assert state["instruction"] == "IMPROVED" + + +# --- FAIL-1: atomic artifact persistence + SIGINT mask ------------------- + +def test_atomic_write_text_no_partial_file_on_failure(tmp_path): + """FAIL-1: ``_atomic_write_text`` must never leave a half-written file. + + If the write step crashes (simulated by a destination whose parent directory is missing), + the destination path either does not exist (first run) or holds its + pre-call content untouched — never a partial write.""" + from trpc_agent_sdk.evaluation._agent_optimizer import _atomic_write_text + + target = tmp_path / "result.json" + target.write_text("ORIGINAL", encoding="utf-8") + + # Force the write to fail by pointing it at a destination + # path whose parent does not exist. + bad_path = tmp_path / "no_such_dir" / "result.json" + with pytest.raises(FileNotFoundError): + _atomic_write_text(str(bad_path), "PARTIAL_CONTENT") + + # The pre-existing sibling file (a different path from bad_path) is untouched. + assert target.read_text(encoding="utf-8") == "ORIGINAL" + # No .tmp leaked at the bad path's parent (parent missing, nothing to clean).
+ assert not bad_path.exists() + + +def test_atomic_write_text_replaces_existing_file(tmp_path): + """FAIL-1: atomic write must fully replace any pre-existing content.""" + from trpc_agent_sdk.evaluation._agent_optimizer import _atomic_write_text + + target = tmp_path / "out.txt" + target.write_text("OLD", encoding="utf-8") + _atomic_write_text(str(target), "NEW") + assert target.read_text(encoding="utf-8") == "NEW" + assert not (tmp_path / "out.txt.tmp").exists() + + +def test_mask_sigint_restores_previous_handler(): + """FAIL-1: ``_mask_sigint`` must restore the original SIGINT handler on exit, + even if the wrapped block raises.""" + import signal as _signal + + from trpc_agent_sdk.evaluation._agent_optimizer import _mask_sigint + + original = _signal.getsignal(_signal.SIGINT) + try: + sentinel_called = [] + + def _sentinel(signum, frame): # pragma: no cover + sentinel_called.append(signum) + + _signal.signal(_signal.SIGINT, _sentinel) + try: + with _mask_sigint(): + # While masked, the handler is SIG_IGN, not _sentinel. + assert _signal.getsignal(_signal.SIGINT) == _signal.SIG_IGN + # On exit, _sentinel is restored. + assert _signal.getsignal(_signal.SIGINT) is _sentinel + + # Raising inside the block still restores. + with pytest.raises(RuntimeError): + with _mask_sigint(): + assert _signal.getsignal(_signal.SIGINT) == _signal.SIG_IGN + raise RuntimeError("boom") + assert _signal.getsignal(_signal.SIGINT) is _sentinel + finally: + _signal.signal(_signal.SIGINT, original) + finally: + # Belt-and-suspenders restore so a test crash cannot leave the + # interpreter in a weird state for sibling tests. + _signal.signal(_signal.SIGINT, original) + + +def test_mask_sigint_no_op_off_main_thread(): + """FAIL-1: ``_mask_sigint`` must degrade to a no-op when invoked from a + non-main thread (``signal.signal`` raises ValueError there). 
+ + The artifact persistence path runs in whatever event-loop thread the + caller picked; we still want it to complete cleanly even if SIGINT + masking isn't available.""" + import threading + + from trpc_agent_sdk.evaluation._agent_optimizer import _mask_sigint + + errors: list[BaseException] = [] + + def _runner() -> None: + try: + with _mask_sigint(): + pass + except BaseException as exc: # pragma: no cover - guard + errors.append(exc) + + t = threading.Thread(target=_runner) + t.start() + t.join() + assert errors == [] + + +@pytest.mark.asyncio +async def test_persist_artifacts_uses_atomic_write_for_summary_and_result( + tmp_path, monkeypatch +): + """FAIL-1: ``_persist_artifacts`` must route every artifact write through + ``os.replace`` (the tmp+rename atomic primitive). Spy ``os.replace`` and + confirm result.json, summary.txt, run.log, config.snapshot.json and at + least one round_*.json file show up as replace targets.""" + import os as _os + + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + replaced: list[str] = [] + real_replace = _os.replace + + def _spy_replace(src, dst): + replaced.append(str(dst)) + return real_replace(src, dst) + + monkeypatch.setattr( + "trpc_agent_sdk.evaluation._agent_optimizer.os.replace", _spy_replace + ) + + output_dir = tmp_path / "runs" / "fail1_atomic" + await
AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(output_dir), + verbose=0, + ) + + # Every persisted artifact must have gone through atomic rename. + replaced_names = {_os.path.basename(p) for p in replaced} + assert "result.json" in replaced_names + assert "summary.txt" in replaced_names + assert "run.log" in replaced_names + assert "config.snapshot.json" in replaced_names + # At least one round file and one baseline / best prompt. + assert any(n.startswith("round_") and n.endswith(".json") for n in replaced_names) + # No leftover .tmp files in output_dir tree. + leftover_tmps = list(output_dir.rglob("*.tmp")) + assert leftover_tmps == [], f"unexpected .tmp residue: {leftover_tmps}" + + +@pytest.mark.asyncio +async def test_persist_artifacts_masks_sigint_during_writes( + tmp_path, monkeypatch +): + """FAIL-1: while ``_persist_artifacts`` runs, SIGINT must be masked so a + panicked second Ctrl+C during teardown cannot interrupt artifact + writes mid-os.replace. 
We verify by checking ``signal.getsignal`` from + inside a spied-on artifact write.""" + import signal as _signal + + train = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train.model_dump_json(), encoding="utf-8") + val_path.write_text(val.model_dump_json(), encoding="utf-8") + config_path = _write_config_file(tmp_path) + target = _new_target_prompt() + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "BASELINE"}, {"instruction": "IMPROVED"}], + val_scores=[0.5, 0.9], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + sigint_state_during_persist: list = [] + real_replace = __import__("os").replace + + def _spy_replace(src, dst): + sigint_state_during_persist.append(_signal.getsignal(_signal.SIGINT)) + return real_replace(src, dst) + + monkeypatch.setattr( + "trpc_agent_sdk.evaluation._agent_optimizer.os.replace", _spy_replace + ) + + original = _signal.getsignal(_signal.SIGINT) + try: + await AgentOptimizer.optimize( + config_path=config_path, + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs" / "fail1_sigint"), + verbose=0, + ) + finally: + # Belt-and-suspenders restore in case the mask didn't unwind correctly. + _signal.signal(_signal.SIGINT, original) + + # Every replace observed during persistence saw SIGINT == SIG_IGN. 
+ assert sigint_state_during_persist, "expected at least one artifact write" + assert all( + state == _signal.SIG_IGN for state in sigint_state_during_persist + ), ( + "SIGINT must be masked during artifact persistence; observed handlers: " + f"{sigint_state_during_persist!r}" + ) + + # After optimize returns, the prior handler is restored. + assert _signal.getsignal(_signal.SIGINT) is original diff --git a/tests/evaluation/test_base_optimizer.py b/tests/evaluation/test_base_optimizer.py new file mode 100644 index 0000000..aaab841 --- /dev/null +++ b/tests/evaluation/test_base_optimizer.py @@ -0,0 +1,240 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for BaseOptimizer abstract interface.""" + +from __future__ import annotations + +from typing import Optional + +import pytest + +from trpc_agent_sdk.evaluation._base_optimizer import BaseOptimizer +from trpc_agent_sdk.evaluation._eval_callbacks import Callbacks +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfigFile +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + +def _dummy_result() -> OptimizeResult: + return OptimizeResult( + algorithm="stub", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + total_rounds=0, + total_reflection_lm_calls=0, + total_judge_model_calls=0, + duration_seconds=0.0, + started_at="1970-01-01T00:00:00Z", + finished_at="1970-01-01T00:00:00Z", + ) + + +def _make_config() -> OptimizeConfigFile: + return OptimizeConfigFile.model_validate( + { + "evaluate": {"metrics": [{"metric_name": "x", "threshold": 0.7}]}, + "optimize": { + "algorithm": { + "name": 
"gepa_reflective", + "reflection_lm": OptimizeModelOptions( + model_name="m", api_key="k" + ).model_dump(), + "max_metric_calls": 10, + } + }, + } + ) + + +async def _noop_call_agent(query: str) -> str: + return "" + + +class _StubOptimizer(BaseOptimizer): + async def run(self) -> OptimizeResult: + return _dummy_result() + + +class _IncompleteOptimizer(BaseOptimizer): + """Subclass without implementing run().""" + + +def test_base_optimizer_cannot_instantiate_directly(tmp_path): + target_prompt = TargetPrompt().add_path("system_prompt", str(_seed_prompt(tmp_path))) + with pytest.raises(TypeError): + BaseOptimizer( + config=_make_config(), + call_agent=_noop_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(tmp_path / "train.json"), + validation_dataset_path=str(tmp_path / "val.json"), + ) + + +def test_base_optimizer_subclass_without_run_cannot_instantiate(tmp_path): + target_prompt = TargetPrompt().add_path("system_prompt", str(_seed_prompt(tmp_path))) + with pytest.raises(TypeError): + _IncompleteOptimizer( + config=_make_config(), + call_agent=_noop_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(tmp_path / "train.json"), + validation_dataset_path=str(tmp_path / "val.json"), + ) + + +def test_base_optimizer_stores_constructor_arguments(tmp_path): + seed_path = _seed_prompt(tmp_path) + target_prompt = TargetPrompt().add_path("system_prompt", str(seed_path)) + config = _make_config() + train_path = str(tmp_path / "train.json") + val_path = str(tmp_path / "val.json") + callbacks = Callbacks() + + optimizer = _StubOptimizer( + config=config, + call_agent=_noop_call_agent, + target_prompt=target_prompt, + train_dataset_path=train_path, + validation_dataset_path=val_path, + callbacks=callbacks, + ) + + assert optimizer.config is config + assert optimizer.call_agent is _noop_call_agent + assert optimizer.target_prompt is target_prompt + assert optimizer.train_dataset_path == train_path + assert optimizer.validation_dataset_path == 
val_path + assert optimizer.callbacks is callbacks + + +def test_base_optimizer_callbacks_default_to_none(tmp_path): + target_prompt = TargetPrompt().add_path("system_prompt", str(_seed_prompt(tmp_path))) + optimizer = _StubOptimizer( + config=_make_config(), + call_agent=_noop_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(tmp_path / "train.json"), + validation_dataset_path=str(tmp_path / "val.json"), + ) + assert optimizer.callbacks is None + + +def test_base_optimizer_rejects_positional_arguments(tmp_path): + target_prompt = TargetPrompt().add_path("system_prompt", str(_seed_prompt(tmp_path))) + with pytest.raises(TypeError): + _StubOptimizer( + _make_config(), + _noop_call_agent, + target_prompt, + str(tmp_path / "train.json"), + str(tmp_path / "val.json"), + ) + + +def test_base_optimizer_run_is_async(): # sync on purpose: inspects BaseOptimizer.run without awaiting anything + import inspect + + assert inspect.iscoroutinefunction(BaseOptimizer.run) + + +def _seed_prompt(tmp_path): + seed = tmp_path / "system.md" + seed.write_text("you are a helpful assistant", encoding="utf-8") + return seed + + +# --------------------------------------------------------------------------- +# BaseOptimizer.resolve_required_thresholds +# --------------------------------------------------------------------------- + + +def _stop_cfg(required_metrics): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + return FrameworkStopConfig(required_metrics=required_metrics) + + +def test_resolve_required_thresholds_all_returns_full_dict(): + thresholds = {"m1": 0.5, "m2": 0.3} + assert ( + BaseOptimizer.resolve_required_thresholds(_stop_cfg("all"), thresholds) + == thresholds + ) + + +def test_resolve_required_thresholds_list_returns_subset(): + thresholds = {"m1": 0.5, "m2": 0.3, "m3": 0.9} + assert BaseOptimizer.resolve_required_thresholds( + _stop_cfg(["m1", "m3"]), thresholds + ) == {"m1": 0.5, "m3": 0.9} + + +def test_resolve_required_thresholds_none_returns_empty(): + assert ( +
BaseOptimizer.resolve_required_thresholds(_stop_cfg(None), {"m1": 0.5}) + == {} + ) + + +def test_resolve_required_thresholds_empty_list_returns_empty(): + assert ( + BaseOptimizer.resolve_required_thresholds(_stop_cfg([]), {"m1": 0.5}) + == {} + ) + + +def test_resolve_required_thresholds_list_silently_drops_unknown_names(): + thresholds = {"m1": 0.5} + assert BaseOptimizer.resolve_required_thresholds( + _stop_cfg(["m1", "missing"]), thresholds + ) == {"m1": 0.5} + + +def test_resolve_required_thresholds_returns_copy_not_alias(): + thresholds = {"m1": 0.5} + out = BaseOptimizer.resolve_required_thresholds(_stop_cfg("all"), thresholds) + out["m1"] = 9.9 + assert thresholds["m1"] == 0.5 + + +# --------------------------------------------------------------------------- +# BaseOptimizer.metrics_meet_thresholds +# --------------------------------------------------------------------------- + + +def test_metrics_meet_thresholds_empty_required_returns_false(): + assert BaseOptimizer.metrics_meet_thresholds({"m1": 1.0}, {}) is False + + +def test_metrics_meet_thresholds_all_above_returns_true(): + assert ( + BaseOptimizer.metrics_meet_thresholds( + {"m1": 0.6, "m2": 0.4}, {"m1": 0.5, "m2": 0.3} + ) + is True + ) + + +def test_metrics_meet_thresholds_one_below_returns_false(): + assert ( + BaseOptimizer.metrics_meet_thresholds( + {"m1": 0.6, "m2": 0.2}, {"m1": 0.5, "m2": 0.3} + ) + is False + ) + + +def test_metrics_meet_thresholds_exact_match_returns_true(): + assert BaseOptimizer.metrics_meet_thresholds({"m1": 0.5}, {"m1": 0.5}) is True + + +def test_metrics_meet_thresholds_missing_breakdown_key_returns_false(): + assert BaseOptimizer.metrics_meet_thresholds({"m2": 0.9}, {"m1": 0.5}) is False diff --git a/tests/evaluation/test_optimize_config.py b/tests/evaluation/test_optimize_config.py new file mode 100644 index 0000000..f722a01 --- /dev/null +++ b/tests/evaluation/test_optimize_config.py @@ -0,0 +1,629 @@ +# Tencent is pleased to support the open source community by 
making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for OptimizeConfigFile and discriminated algorithm union.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest +from pydantic import ValidationError + +from trpc_agent_sdk.evaluation._eval_config import EvalConfig +from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfig +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfigFile +from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + +_VALID_REFLECTION_LM = { + "model_name": "gpt-4o", + "api_key": "opt-key", + "base_url": "https://api.example.com", + "generation_config": {"temperature": 0.2}, +} + + +def _evaluate_section_dict() -> dict: + return { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 0.7, + "criterion": {"finalResponse": {}}, + } + ], + "num_runs": 2, + } + + +def _gepa_algorithm_dict() -> dict: + return { + "name": "gepa_reflective", + "reflection_lm": _VALID_REFLECTION_LM, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "use_merge": False, + "max_merge_invocations": 5, + "skip_perfect_score": True, + "max_metric_calls": 50, + } + + +def _full_config_dict_gepa() -> dict: + return { + "evaluate": _evaluate_section_dict(), + "optimize": { + "eval_case_parallelism": 8, + "algorithm": {**_gepa_algorithm_dict(), "seed": 7}, + }, + } + + +def test_evaluate_section_is_plain_eval_config(): + payload = { + "evaluate": {"metrics": [{"metric_name": "x", "threshold": 0.7}], "num_runs": 1}, + "optimize": {"algorithm": _gepa_algorithm_dict()}, + } + cfg = OptimizeConfigFile.model_validate(payload) + assert type(cfg.evaluate) is EvalConfig + 
assert cfg.evaluate.num_runs == 1 + metrics = cfg.evaluate.get_eval_metrics() + assert len(metrics) == 1 + assert metrics[0].metric_name == "x" + + +def test_evaluate_section_rejects_unknown_field_via_eval_config_forbid(): + payload = { + "evaluate": { + "metrics": [{"metric_name": "x", "threshold": 0.7}], + "train_dataset_path": "unsupported", + }, + "optimize": {"algorithm": _gepa_algorithm_dict()}, + } + with pytest.raises(ValidationError): + OptimizeConfigFile.model_validate(payload) + + +def test_gepa_reflective_algo_minimal_required_fields(): + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="gpt-4o", api_key="k"), + max_metric_calls=10, + ) + assert algo.name == "gepa_reflective" + assert algo.reflection_lm.model_name == "gpt-4o" + assert algo.seed == 42 + assert algo.candidate_selection_strategy == "pareto" + assert algo.module_selector == "round_robin" + assert algo.frontier_type == "instance" + assert algo.use_merge is False + assert algo.max_merge_invocations == 5 + assert algo.merge_val_overlap_floor == 5 + assert algo.skip_perfect_score is True + assert algo.perfect_score == 1.0 + assert algo.cache_evaluation is False + assert algo.track_best_outputs is False + assert algo.reflection_minibatch_size is None + assert algo.max_metric_calls == 10 + assert algo.max_iterations_without_improvement is None + assert algo.timeout_seconds is None + assert algo.score_threshold is None + assert algo.max_candidate_proposals is None + assert algo.max_tracked_candidates is None + + +def test_gepa_reflective_algo_rejects_unknown_field(): + with pytest.raises(ValidationError): + GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + typo_field=1, + ) + + +def test_gepa_reflective_algo_rejects_illegal_selection_strategy(): + with pytest.raises(ValidationError): + GepaReflectiveAlgo( + name="gepa_reflective", + 
reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + candidate_selection_strategy="bogus", + ) + + +def test_gepa_reflective_algo_rejects_illegal_frontier_type(): + with pytest.raises(ValidationError): + GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + max_metric_calls=10, + frontier_type="something_else", + ) + + +def test_gepa_reflective_algo_requires_at_least_one_stop_condition(): + with pytest.raises(ValidationError) as exc_info: + GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + ) + assert "stop condition" in str(exc_info.value).lower() + + +@pytest.mark.parametrize( + "stop_field,stop_value", + [ + ("max_iterations_without_improvement", 3), + ("timeout_seconds", 10.0), + ("score_threshold", 0.95), + ("max_candidate_proposals", 25), + ("max_tracked_candidates", 32), + ], +) +def test_gepa_reflective_algo_accepts_any_single_stop_condition(stop_field, stop_value): + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(model_name="m", api_key="k"), + **{stop_field: stop_value}, + ) + assert getattr(algo, stop_field) == stop_value + assert algo.max_metric_calls is None + + +def test_optimize_config_requires_algorithm(): + with pytest.raises(ValidationError): + OptimizeConfig() + + +def test_optimize_config_routes_to_gepa_reflective(): + cfg = OptimizeConfig(algorithm=_gepa_algorithm_dict()) + assert isinstance(cfg.algorithm, GepaReflectiveAlgo) + assert cfg.algorithm.name == "gepa_reflective" + + +def test_optimize_config_rejects_unknown_algorithm_name(): + with pytest.raises(ValidationError) as exc_info: + OptimizeConfig( + algorithm={ + "name": "unknown_algo", + "reflection_lm": _VALID_REFLECTION_LM, + "max_metric_calls": 10, + } + ) + assert "unknown_algo" in str(exc_info.value) or "tag" in str(exc_info.value).lower() + + +def 
test_optimize_config_rejects_missing_algorithm_name(): + with pytest.raises(ValidationError): + OptimizeConfig(algorithm={"reflection_lm": _VALID_REFLECTION_LM}) + + +def test_optimize_config_rejects_unknown_top_level_field(): + with pytest.raises(ValidationError): + OptimizeConfig(algorithm=_gepa_algorithm_dict(), unknown_field="boom") + + +def test_optimize_config_seed_only_lives_under_algorithm(): + with pytest.raises(ValidationError): + OptimizeConfig.model_validate( + {"seed": 9, "algorithm": _gepa_algorithm_dict()} + ) + + cfg = OptimizeConfig.model_validate( + {"algorithm": {**_gepa_algorithm_dict(), "seed": 9}} + ) + assert isinstance(cfg.algorithm, GepaReflectiveAlgo) + assert cfg.algorithm.seed == 9 + + +def test_optimize_config_file_requires_both_sections(): + with pytest.raises(ValidationError): + OptimizeConfigFile() + with pytest.raises(ValidationError): + OptimizeConfigFile(optimize=OptimizeConfig(algorithm=_gepa_algorithm_dict())) + + +def test_optimize_config_file_rejects_unknown_top_level_field(): + with pytest.raises(ValidationError): + OptimizeConfigFile.model_validate( + { + "evaluate": _evaluate_section_dict(), + "optimize": {"algorithm": _gepa_algorithm_dict()}, + "unknown_extra": 1, + } + ) + + +def test_load_optimize_config_gepa_round_trip(tmp_path: Path): + cfg_path = tmp_path / "opt.json" + cfg_path.write_text(json.dumps(_full_config_dict_gepa()), encoding="utf-8") + + cfg = load_optimize_config(str(cfg_path)) + + assert isinstance(cfg, OptimizeConfigFile) + assert cfg.evaluate.num_runs == 2 + + metrics = cfg.evaluate.get_eval_metrics() + assert len(metrics) == 1 + assert metrics[0].metric_name == "final_response_avg_score" + + opt = cfg.optimize + assert opt.eval_case_parallelism == 8 + + assert isinstance(opt.algorithm, GepaReflectiveAlgo) + assert opt.algorithm.reflection_lm.model_name == "gpt-4o" + assert opt.algorithm.candidate_selection_strategy == "pareto" + assert opt.algorithm.module_selector == "round_robin" + assert 
opt.algorithm.seed == 7 + assert opt.algorithm.max_metric_calls == 50 + + +def test_load_optimize_config_missing_evaluate_section_raises(tmp_path: Path): + cfg_path = tmp_path / "no_evaluate.json" + cfg_path.write_text( + json.dumps({"optimize": {"algorithm": _gepa_algorithm_dict()}}), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_missing_optimize_section_raises(tmp_path: Path): + cfg_path = tmp_path / "no_optimize.json" + cfg_path.write_text( + json.dumps({"evaluate": _evaluate_section_dict()}), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_missing_algorithm_raises(tmp_path: Path): + cfg_path = tmp_path / "no_algo.json" + cfg_path.write_text( + json.dumps({"evaluate": _evaluate_section_dict(), "optimize": {}}), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_typo_in_evaluate_section_fails_fast(tmp_path: Path): + cfg_path = tmp_path / "typo_eval.json" + cfg_path.write_text( + json.dumps( + { + "evaluate": { + "mertics": [{"metric_name": "x", "threshold": 0.7}], + "num_runs": 1, + }, + "optimize": {"algorithm": _gepa_algorithm_dict()}, + } + ), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_typo_in_optimize_section_fails_fast(tmp_path: Path): + cfg_path = tmp_path / "typo_opt.json" + cfg_path.write_text( + json.dumps( + { + "evaluate": _evaluate_section_dict(), + "optimize": { + "maxRoundds": 5, + "algorithm": _gepa_algorithm_dict(), + }, + } + ), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_typo_in_algorithm_fails_fast(tmp_path: Path): + cfg_path = tmp_path / "typo_algo.json" + bad_algo = _gepa_algorithm_dict() + 
bad_algo["max_metricc_calls"] = 100 + cfg_path.write_text( + json.dumps({"evaluate": _evaluate_section_dict(), "optimize": {"algorithm": bad_algo}}), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_unknown_algorithm_name_fails_fast(tmp_path: Path): + cfg_path = tmp_path / "unknown_algo.json" + cfg_path.write_text( + json.dumps( + { + "evaluate": _evaluate_section_dict(), + "optimize": { + "algorithm": { + "name": "few_shot_bayesian", + "reflection_lm": _VALID_REFLECTION_LM, + "max_metric_calls": 10, + } + }, + } + ), + encoding="utf-8", + ) + with pytest.raises(ValidationError): + load_optimize_config(str(cfg_path)) + + +def test_load_optimize_config_missing_file_raises(tmp_path: Path): + with pytest.raises(FileNotFoundError): + load_optimize_config(str(tmp_path / "does_not_exist.json")) + + +def test_load_optimize_config_camel_case_keys_accepted(tmp_path: Path): + cfg_path = tmp_path / "camel.json" + payload = { + "evaluate": { + "metrics": [{"metricName": "x", "threshold": 0.6}], + "numRuns": 3, + }, + "optimize": { + "evalCaseParallelism": 5, + "algorithm": { + "name": "gepa_reflective", + "reflectionLm": {"modelName": "claude-3.5-sonnet", "apiKey": "k"}, + "candidateSelectionStrategy": "current_best", + "moduleSelector": "all", + "useMerge": True, + "maxMergeInvocations": 7, + "skipPerfectScore": False, + "maxMetricCalls": 30, + "maxIterationsWithoutImprovement": 2, + }, + }, + } + cfg_path.write_text(json.dumps(payload), encoding="utf-8") + cfg = load_optimize_config(str(cfg_path)) + assert cfg.evaluate.num_runs == 3 + assert cfg.optimize.eval_case_parallelism == 5 + assert isinstance(cfg.optimize.algorithm, GepaReflectiveAlgo) + algo = cfg.optimize.algorithm + assert algo.reflection_lm.model_name == "claude-3.5-sonnet" + assert algo.candidate_selection_strategy == "current_best" + assert algo.module_selector == "all" + assert algo.use_merge is True + assert 
algo.max_merge_invocations == 7 + assert algo.skip_perfect_score is False + assert algo.max_metric_calls == 30 + assert algo.max_iterations_without_improvement == 2 + + +def test_loaded_metrics_consumable_by_evaluator(tmp_path: Path): + from trpc_agent_sdk.evaluation import EvalMetric + + cfg_path = tmp_path / "opt.json" + cfg_path.write_text(json.dumps(_full_config_dict_gepa()), encoding="utf-8") + cfg = load_optimize_config(str(cfg_path)) + metrics = cfg.evaluate.get_eval_metrics() + for metric in metrics: + assert isinstance(metric, EvalMetric) + + +# --------------------------------------------------------------------------- +# FrameworkStopConfig +# --------------------------------------------------------------------------- + + +def test_framework_stop_config_default_required_metrics_is_all(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + cfg = FrameworkStopConfig() + assert cfg.required_metrics == "all" + + +def test_framework_stop_config_accepts_metric_list(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + cfg = FrameworkStopConfig(required_metrics=["m1", "m2"]) + assert cfg.required_metrics == ["m1", "m2"] + + +def test_framework_stop_config_accepts_none_to_disable(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + cfg = FrameworkStopConfig(required_metrics=None) + assert cfg.required_metrics is None + + +def test_framework_stop_config_accepts_empty_list_to_disable(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + cfg = FrameworkStopConfig(required_metrics=[]) + assert cfg.required_metrics == [] + + +def test_framework_stop_config_rejects_invalid_string(): + from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + with pytest.raises(ValidationError): + FrameworkStopConfig(required_metrics="not-all") + + +def test_framework_stop_config_rejects_unknown_field(): + from 
trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig + + with pytest.raises(ValidationError): + FrameworkStopConfig(required_metrics="all", typo_field=True) + + +# --------------------------------------------------------------------------- +# OptimizeConfig.stop wiring +# --------------------------------------------------------------------------- + + +def test_optimize_config_stop_defaults_to_required_metrics_all(): + cfg = OptimizeConfig(algorithm=_gepa_algorithm_dict()) + assert cfg.stop.required_metrics == "all" + + +def test_optimize_config_stop_explicit_list(): + cfg = OptimizeConfig.model_validate( + {"algorithm": _gepa_algorithm_dict(), "stop": {"required_metrics": ["m1"]}} + ) + assert cfg.stop.required_metrics == ["m1"] + + +def test_optimize_config_top_level_fields(): + cfg = OptimizeConfig(algorithm=_gepa_algorithm_dict()) + assert cfg.eval_case_parallelism == 4 + assert set(OptimizeConfig.model_fields.keys()) == { + "eval_case_parallelism", + "stop", + "algorithm", + } + + +# --------------------------------------------------------------------------- +# OptimizeConfigFile cross-field validator +# --------------------------------------------------------------------------- + + +def test_optimize_config_file_cross_field_rejects_unknown_required_metric(): + with pytest.raises(ValidationError) as exc_info: + OptimizeConfigFile.model_validate( + { + "evaluate": { + "metrics": [ + {"metric_name": "m1", "threshold": 0.5}, + ], + }, + "optimize": { + "algorithm": _gepa_algorithm_dict(), + "stop": {"required_metrics": ["m1", "bogus"]}, + }, + } + ) + assert "bogus" in str(exc_info.value) + + +def test_optimize_config_file_cross_field_accepts_known_required_metrics(): + cfg = OptimizeConfigFile.model_validate( + { + "evaluate": { + "metrics": [ + {"metric_name": "m1", "threshold": 0.5}, + {"metric_name": "m2", "threshold": 0.3}, + ], + }, + "optimize": { + "algorithm": _gepa_algorithm_dict(), + "stop": {"required_metrics": ["m1"]}, + }, + } + ) + 
assert cfg.optimize.stop.required_metrics == ["m1"] + + +def test_optimize_config_file_cross_field_skipped_when_required_metrics_is_all(): + cfg = OptimizeConfigFile.model_validate( + { + "evaluate": { + "metrics": [{"metric_name": "m1", "threshold": 0.5}], + }, + "optimize": { + "algorithm": _gepa_algorithm_dict(), + "stop": {"required_metrics": "all"}, + }, + } + ) + assert cfg.optimize.stop.required_metrics == "all" + + +def test_optimize_config_file_cross_field_skipped_when_required_metrics_is_none(): + cfg = OptimizeConfigFile.model_validate( + { + "evaluate": { + "metrics": [{"metric_name": "m1", "threshold": 0.5}], + }, + "optimize": { + "algorithm": _gepa_algorithm_dict(), + "stop": {"required_metrics": None}, + }, + } + ) + assert cfg.optimize.stop.required_metrics is None + + +def test_optimize_config_file_no_stop_block_defaults_to_all(): + cfg = OptimizeConfigFile.model_validate( + { + "evaluate": {"metrics": [{"metric_name": "m1", "threshold": 0.5}]}, + "optimize": {"algorithm": _gepa_algorithm_dict()}, + } + ) + assert cfg.optimize.stop.required_metrics == "all" + + +def test_load_optimize_config_with_stop_block_round_trip(tmp_path: Path): + payload = _full_config_dict_gepa() + payload["optimize"]["stop"] = { + "required_metrics": ["final_response_avg_score"] + } + cfg_path = tmp_path / "with_stop.json" + cfg_path.write_text(json.dumps(payload), encoding="utf-8") + cfg = load_optimize_config(str(cfg_path)) + assert cfg.optimize.stop.required_metrics == ["final_response_avg_score"] + + +def test_gepa_reflective_algo_reflection_history_top_k_default_is_two() -> None: + from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo + from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + ) + + assert algo.reflection_history_top_k == 2 + + +def 
test_gepa_reflective_algo_reflection_history_top_k_can_be_zero() -> None: + """K=0 disables the feature.""" + from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo + from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + reflection_history_top_k=0, + ) + + assert algo.reflection_history_top_k == 0 + + +def test_gepa_reflective_algo_reflection_history_top_k_rejects_six() -> None: + """Cap at 5 to bound prompt-token blow-up.""" + import pytest + from pydantic import ValidationError + + from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo + from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + with pytest.raises(ValidationError): + GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + reflection_history_top_k=6, + ) diff --git a/tests/evaluation/test_optimize_evaluator_call.py b/tests/evaluation/test_optimize_evaluator_call.py new file mode 100644 index 0000000..8dacb9a --- /dev/null +++ b/tests/evaluation/test_optimize_evaluator_call.py @@ -0,0 +1,613 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for the optimizer-facing evaluator call wrapper.""" + +from __future__ import annotations + +import pytest + +from trpc_agent_sdk.evaluation._eval_metrics import EvalStatus +from trpc_agent_sdk.evaluation._eval_result import EvalCaseResult +from trpc_agent_sdk.evaluation._eval_result import EvalMetricResult +from trpc_agent_sdk.evaluation._eval_result import EvalSetAggregateResult +from trpc_agent_sdk.evaluation._eval_result import EvaluateResult +from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome +from trpc_agent_sdk.evaluation._optimize_evaluator_call import run_evaluator +from trpc_agent_sdk.evaluation._optimize_evaluator_call import summarize_outcome + + +def _metric(name: str, score: float, status: EvalStatus = EvalStatus.PASSED) -> EvalMetricResult: + return EvalMetricResult( + metric_name=name, + threshold=0.5, + score=score, + eval_status=status, + ) + + +def _case( + eval_id: str, + final_status: EvalStatus, + metric_scores: dict[str, tuple[float, EvalStatus]], +) -> EvalCaseResult: + metrics = [_metric(n, s, st) for n, (s, st) in metric_scores.items()] + return EvalCaseResult( + eval_set_id="s1", + eval_id=eval_id, + final_eval_status=final_status, + overall_eval_metric_results=metrics, + eval_metric_result_per_invocation=[], + session_id=f"sess-{eval_id}", + ) + + +def _result(cases: list[EvalCaseResult], num_runs: int = 1) -> EvaluateResult: + by_id: dict[str, list[EvalCaseResult]] = {} + for c in cases: + by_id.setdefault(c.eval_id, []).append(c) + return EvaluateResult( + results_by_eval_set_id={ + "s1": EvalSetAggregateResult( + eval_results_by_eval_id=by_id, + num_runs=num_runs, + ), + } + ) + + +def test_summarize_outcome_all_passed_pass_rate_one(): + result = _result([ + _case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}), + _case("c2", EvalStatus.PASSED, {"m": (0.95, EvalStatus.PASSED)}), + ]) + outcome = summarize_outcome(result) + assert outcome.pass_rate == 1.0 + assert 
outcome.failed_case_ids == [] + assert pytest.approx(outcome.tiebreaker) == (0.9 + 0.95) / 2 + + +def test_summarize_outcome_partial_pass_rate(): + result = _result([ + _case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}), + _case("c2", EvalStatus.FAILED, {"m": (0.3, EvalStatus.FAILED)}), + _case("c3", EvalStatus.FAILED, {"m": (0.2, EvalStatus.FAILED)}), + _case("c4", EvalStatus.PASSED, {"m": (0.8, EvalStatus.PASSED)}), + ]) + outcome = summarize_outcome(result) + assert outcome.pass_rate == 0.5 + assert set(outcome.failed_case_ids) == {"c2", "c3"} + + +def test_summarize_outcome_empty_result_zero_pass_rate(): + outcome = summarize_outcome(EvaluateResult()) + assert outcome.pass_rate == 0.0 + assert outcome.tiebreaker == 0.0 + assert outcome.failed_case_ids == [] + assert outcome.metric_breakdown == {} + + +def test_summarize_outcome_metric_breakdown_averages_scores(): + result = _result([ + _case("c1", EvalStatus.PASSED, { + "metric_a": (0.8, EvalStatus.PASSED), + "metric_b": (0.6, EvalStatus.PASSED), + }), + _case("c2", EvalStatus.PASSED, { + "metric_a": (0.6, EvalStatus.PASSED), + "metric_b": (0.4, EvalStatus.PASSED), + }), + ]) + outcome = summarize_outcome(result) + assert pytest.approx(outcome.metric_breakdown["metric_a"]) == 0.7 + assert pytest.approx(outcome.metric_breakdown["metric_b"]) == 0.5 + + +def test_summarize_outcome_tiebreaker_is_mean_of_all_scores(): + result = _result([ + _case("c1", EvalStatus.PASSED, { + "metric_a": (1.0, EvalStatus.PASSED), + "metric_b": (0.0, EvalStatus.PASSED), + }), + ]) + outcome = summarize_outcome(result) + assert pytest.approx(outcome.tiebreaker) == 0.5 + + +def test_summarize_outcome_skips_none_scores(): + case = EvalCaseResult( + eval_set_id="s1", + eval_id="c1", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[ + EvalMetricResult(metric_name="m", threshold=0.5, score=None, + eval_status=EvalStatus.NOT_EVALUATED), + EvalMetricResult(metric_name="m2", threshold=0.5, score=0.9, + 
eval_status=EvalStatus.PASSED), + ], + eval_metric_result_per_invocation=[], + session_id="x", + ) + outcome = summarize_outcome(_result([case])) + assert outcome.metric_breakdown == {"m2": 0.9} + assert pytest.approx(outcome.tiebreaker) == 0.9 + + +def test_summarize_outcome_multi_run_repeats_failed_id(): + failing = _case("c1", EvalStatus.FAILED, {"m": (0.2, EvalStatus.FAILED)}) + passing = _case("c2", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}) + result = EvaluateResult( + results_by_eval_set_id={ + "s1": EvalSetAggregateResult( + eval_results_by_eval_id={ + "c1": [failing, failing], + "c2": [passing, passing], + }, + num_runs=2, + ), + } + ) + outcome = summarize_outcome(result) + assert outcome.pass_rate == 0.5 + assert outcome.failed_case_ids.count("c1") == 2 + + +def test_evaluation_outcome_is_immutable(): + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.6, + metric_breakdown={"m": 0.5}, + failed_case_ids=["c1"], + judge_model_calls=0, + raw_result=EvaluateResult(), + ) + try: + outcome.pass_rate = 1.0 # type: ignore[misc] + except Exception: + return + raise AssertionError("EvaluationOutcome should be frozen") + + +class _FakeExecuter: + def __init__(self, result: EvaluateResult) -> None: + self._result = result + self.evaluate_called = 0 + + async def evaluate(self) -> None: + self.evaluate_called += 1 + + def get_result(self) -> EvaluateResult: + return self._result + + +@pytest.mark.asyncio +async def test_run_evaluator_passes_through_call_agent_callbacks_num_runs(monkeypatch): + captured: dict = {} + fake_result = _result([ + _case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}), + _case("c2", EvalStatus.FAILED, {"m": (0.2, EvalStatus.FAILED)}), + ]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + captured["eval_dataset_path"] = eval_dataset_file_path_or_dir + captured.update(kwargs) + return _FakeExecuter(fake_result) + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + 
monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + sentinel_callbacks = object() + + outcome = await run_evaluator( + eval_dataset_path="/tmp/some_set.evalset.json", + eval_metrics_path="/tmp/metrics.json", + call_agent=call_agent, + callbacks=sentinel_callbacks, # type: ignore[arg-type] + num_runs=3, + ) + + assert captured["eval_dataset_path"] == "/tmp/some_set.evalset.json" + assert captured["eval_metrics_file_path_or_dir"] == "/tmp/metrics.json" + assert captured["call_agent"] is call_agent + assert captured["callbacks"] is sentinel_callbacks + assert captured["num_runs"] == 3 + assert captured["print_detailed_results"] is False + assert captured["eval_result_output_dir"] is None + + assert outcome.pass_rate == 0.5 + assert outcome.failed_case_ids == ["c2"] + assert outcome.raw_result is fake_result + + +@pytest.mark.asyncio +async def test_run_evaluator_forwards_case_parallelism(monkeypatch): + """spec §3.2: optimize.eval_case_parallelism must reach AgentEvaluator.get_executer.""" + captured: dict = {} + fake_result = _result([_case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)})]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + captured.update(kwargs) + return _FakeExecuter(fake_result) + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + num_runs=1, + case_parallelism=8, + ) + + assert captured["case_parallelism"] == 8 + + +@pytest.mark.asyncio +async def test_run_evaluator_forwards_print_summary_report_false(monkeypatch): + """Optimizer must keep the evaluator silent so its summary table never + collides with the reporter timeline.""" + captured: dict = {} + 
fake_result = _result([_case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)})]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + captured.update(kwargs) + return _FakeExecuter(fake_result) + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + assert captured["print_detailed_results"] is False + assert captured["print_summary_report"] is False + + +class _AssertingExecuter: + """Mimics AgentEvaluator's pytest-style fail-fast on case failure.""" + + def __init__(self, result: EvaluateResult, message: str) -> None: + self._result = result + self._message = message + + async def evaluate(self) -> None: + from trpc_agent_sdk.evaluation._agent_evaluator import _EvaluationCasesFailed + raise _EvaluationCasesFailed(self._message) + + def get_result(self) -> EvaluateResult: + return self._result + + +@pytest.mark.asyncio +async def test_run_evaluator_swallows_evaluator_assertion_and_returns_outcome(monkeypatch): + fake_result = _result([ + _case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)}), + _case("c2", EvalStatus.FAILED, {"m": (0.2, EvalStatus.FAILED)}), + ]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + return _AssertingExecuter(fake_result, "case c2 failed") + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + outcome = await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + assert outcome.pass_rate == 0.5 + assert outcome.failed_case_ids == ["c2"] + assert outcome.raw_result is 
fake_result + + +@pytest.mark.asyncio +async def test_run_evaluator_returns_empty_outcome_when_assertion_loses_result(monkeypatch): + class _LostResultExecuter: + async def evaluate(self) -> None: + from trpc_agent_sdk.evaluation._agent_evaluator import _EvaluationCasesFailed + raise _EvaluationCasesFailed("boom before result populated") + + def get_result(self): + return None + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr( + mod.AgentEvaluator, "get_executer", lambda *a, **k: _LostResultExecuter() + ) + + async def call_agent(q: str) -> str: + return "x" + + outcome = await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + assert outcome.pass_rate == 0.0 + assert outcome.failed_case_ids == [] + + +@pytest.mark.asyncio +async def test_run_evaluator_does_not_swallow_unrelated_assertion_error(monkeypatch): + """FAIL-3: only ``_EvaluationCasesFailed`` is the business signal. + + Third-party / SDK-internal ``AssertionError`` (numpy ``assert_allclose``, + invariant self-checks, ...) must NOT be silently consumed — that would + hide real bugs behind a 0.0 pass_rate and let the optimizer continue + training against phantom data. + """ + class _BuggyExecuter: + async def evaluate(self) -> None: + # Stand-in for an unrelated assertion failure inside the evaluator + # (e.g. a numpy invariant check, a library bug). 
+ raise AssertionError("invariant violated: this is NOT a case-failure signal") + + def get_result(self): # pragma: no cover - never reached + return None + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr( + mod.AgentEvaluator, "get_executer", lambda *a, **k: _BuggyExecuter() + ) + + async def call_agent(q: str) -> str: + return "x" + + with pytest.raises(AssertionError, match="invariant violated"): + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + +@pytest.mark.asyncio +async def test_run_evaluator_propagates_real_upstream_error(monkeypatch): + """FAIL-3: real upstream errors (FileNotFoundError, network, ...) must + propagate, not be silently turned into an empty outcome. + + The pre-fix code had ``try / except AssertionError / finally: + result = get_result()`` which masked any non-Assertion exception too if + the executer's ``get_result()`` returned None — actually it re-raised, + but the optimizer downstream had no way to distinguish "all cases + silently failed" from "evalset file missing on disk". The post-fix code + propagates these to ``AgentOptimizer.optimize()`` ``run_error`` path so + the run terminates with status=FAILED and the cause is preserved in + ``summary.txt`` rather than silently producing 0.0 pass_rate. 
+ """ + class _BrokenExecuter: + async def evaluate(self) -> None: + raise FileNotFoundError("dataset.evalset.json") + + def get_result(self): # pragma: no cover - never reached + return None + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr( + mod.AgentEvaluator, "get_executer", lambda *a, **k: _BrokenExecuter() + ) + + async def call_agent(q: str) -> str: + return "x" + + with pytest.raises(FileNotFoundError, match="dataset.evalset.json"): + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + +def test_evaluation_cases_failed_is_assertion_error_subclass(): + """FAIL-3: ``_EvaluationCasesFailed`` MUST remain an ``AssertionError`` + subclass so direct ``AgentEvaluator.evaluate()`` callers (e.g. + ``examples/optimization/ci_integration/tests/test_agent_quality.py``) + can keep using ``except AssertionError`` / pytest's native AssertionError + rendering for JUnit XML output without any change.""" + from trpc_agent_sdk.evaluation._agent_evaluator import _EvaluationCasesFailed + err = _EvaluationCasesFailed("failure summary json") + assert isinstance(err, AssertionError) + # Message identity matters for JUnit XML stability. + assert str(err) == "failure summary json" + + +@pytest.mark.asyncio +async def test_eval_executer_raises_evaluation_cases_failed_on_case_failure(tmp_path, monkeypatch): + """FAIL-3 end-to-end: ``_EvalExecuter._run`` MUST raise + ``_EvaluationCasesFailed`` (NOT a bare ``assert False``) when any case + fails. Replacing the bare assert with a real ``raise`` keeps the signal + alive under ``python -O`` — which strips ``assert`` statements — and + avoids piggy-backing business control flow on Python's invariant-check + mechanism. 
+ + We monkeypatch ``evaluate_eval_set`` so this test does not need a real + LLM / runner: the test verifies the post-loop branch in ``_run`` that + converts ``all_failures`` into ``_EvaluationCasesFailed``. + """ + import json as _json + + from trpc_agent_sdk.evaluation._agent_evaluator import ( + AgentEvaluator as _Eval, + ) + from trpc_agent_sdk.evaluation._agent_evaluator import ( + _EvaluationCasesFailed, + ) + from trpc_agent_sdk.evaluation._eval_case import EvalCase + from trpc_agent_sdk.evaluation._eval_case import Invocation + from trpc_agent_sdk.evaluation._eval_config import EvalConfig + from trpc_agent_sdk.evaluation._eval_set import EvalSet + from trpc_agent_sdk.types import Content + from trpc_agent_sdk.types import Part + + # Build the smallest possible evalset on disk so _run can load it. + eval_set = EvalSet( + eval_set_id="es_fail3", + eval_cases=[ + EvalCase( + eval_id="c1", + conversation=[ + Invocation( + user_content=Content( + role="user", parts=[Part.from_text(text="hi")] + ), + final_response=Content( + role="model", parts=[Part.from_text(text="ack")] + ), + ) + ], + ) + ], + ) + evalset_path = tmp_path / "tiny.evalset.json" + evalset_path.write_text(eval_set.model_dump_json(), encoding="utf-8") + config_path = tmp_path / "test_config.json" + config_path.write_text( + EvalConfig(criteria={"final_response_avg_score": 0.5}).model_dump_json(), + encoding="utf-8", + ) + + async def fake_evaluate_eval_set(eval_set_arg, **kwargs): + # Pretend case c1 failed with a structured summary. 
+ failed_summary = { + "overallStatus": "failed", + "evalCases": [{"evalCaseId": "c1", "overallStatus": "failed"}], + } + return failed_summary, [], [], {"c1": []} + + monkeypatch.setattr(_Eval, "evaluate_eval_set", staticmethod(fake_evaluate_eval_set)) + + async def call_agent(query: str) -> str: + return "ack" + + executer = _Eval.get_executer( + str(evalset_path), + call_agent=call_agent, + print_summary_report=False, + print_detailed_results=False, + ) + + with pytest.raises(_EvaluationCasesFailed) as excinfo: + await executer.evaluate() + + # The error message is the JSON-encoded failure summary — pytest renders + # this verbatim in JUnit XML, so existing CI dashboards keep working. + parsed = _json.loads(str(excinfo.value)) + assert parsed[0]["evalSetId"] == "es_fail3" + assert parsed[0]["summary"]["overallStatus"] == "failed" + + # Back-compat: ``isinstance(err, AssertionError)`` MUST stay True so + # ``examples/optimization/ci_integration`` (``except AssertionError``) + # works unchanged. + assert isinstance(excinfo.value, AssertionError) + + # The result was populated BEFORE the raise (line ordering in _run); + # callers can recover the EvaluateResult even on the failure path. + assert executer.get_result() is not None + + +@pytest.mark.asyncio +async def test_eval_executer_signal_survives_python_O_mode(tmp_path, monkeypatch): + """FAIL-3 python -O coverage: ``_run`` MUST NOT use ``assert`` for the + business signal. We can't actually run pytest under ``-O`` here, but + we can prove the signal does not depend on assertions by checking the + source code contains ``raise _EvaluationCasesFailed`` and NOT + ``assert False`` in the case-failure branch. + + A grep-style guard test is overkill for most things, but ``python -O`` + failures are notoriously hard to reproduce and were the exact root + cause of FAIL-3 — pinning the implementation contract here prevents + a careless future rewrite from reintroducing the bug. 
+ """ + import ast + from pathlib import Path + + source = Path( + "trpc_agent_sdk/evaluation/_agent_evaluator.py" + ).read_text(encoding="utf-8") + assert "raise _EvaluationCasesFailed(combined)" in source, ( + "_run must raise _EvaluationCasesFailed for the case-failure signal" + ) + # Parse the AST and walk every Assert node inside _EvalExecuter._run; + # there MUST be none — case failure must be raised, not asserted. + tree = ast.parse(source) + run_method = None + for node in ast.walk(tree): + if ( + isinstance(node, ast.ClassDef) + and node.name == "_EvalExecuter" + ): + for sub in node.body: + if isinstance(sub, ast.AsyncFunctionDef) and sub.name == "_run": + run_method = sub + break + assert run_method is not None, "could not locate _EvalExecuter._run" + asserts_in_run = [ + n for n in ast.walk(run_method) if isinstance(n, ast.Assert) + ] + assert asserts_in_run == [], ( + f"_EvalExecuter._run MUST NOT contain any ``assert`` statements " + f"(stripped by python -O); found {len(asserts_in_run)} " + f"at lines {[a.lineno for a in asserts_in_run]}" + ) + + +@pytest.mark.asyncio +async def test_run_evaluator_default_num_runs_is_one(monkeypatch): + captured: dict = {} + fake_result = _result([_case("c1", EvalStatus.PASSED, {"m": (0.9, EvalStatus.PASSED)})]) + + def fake_get_executer(eval_dataset_file_path_or_dir, **kwargs): + captured.update(kwargs) + return _FakeExecuter(fake_result) + + from trpc_agent_sdk.evaluation import _optimize_evaluator_call as mod + + monkeypatch.setattr(mod.AgentEvaluator, "get_executer", fake_get_executer) + + async def call_agent(q: str) -> str: + return "x" + + await run_evaluator( + eval_dataset_path="/tmp/x.json", + eval_metrics_path=None, + call_agent=call_agent, + callbacks=None, + ) + + assert captured["num_runs"] == 1 + assert captured["callbacks"] is None + assert captured["eval_metrics_file_path_or_dir"] is None diff --git a/tests/evaluation/test_optimize_gepa_adapter.py b/tests/evaluation/test_optimize_gepa_adapter.py new 
file mode 100644 index 0000000..e497c74 --- /dev/null +++ b/tests/evaluation/test_optimize_gepa_adapter.py @@ -0,0 +1,1748 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for the gepa protocol adapter and trajectory/feedback helpers.""" + +from __future__ import annotations + +from typing import Optional + +import pytest + +from trpc_agent_sdk.evaluation._eval_case import EvalCase +from trpc_agent_sdk.evaluation._eval_case import Invocation +from trpc_agent_sdk.evaluation._eval_config import EvalConfig +from trpc_agent_sdk.evaluation._eval_metrics import EvalStatus +from trpc_agent_sdk.evaluation._eval_result import EvalCaseResult +from trpc_agent_sdk.evaluation._eval_result import EvalMetricResult +from trpc_agent_sdk.evaluation._eval_result import EvalMetricResultDetails +from trpc_agent_sdk.evaluation._eval_result import EvalMetricResultPerInvocation +from trpc_agent_sdk.evaluation._eval_result import EvalSetAggregateResult +from trpc_agent_sdk.evaluation._eval_result import EvaluateResult +from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome +from trpc_agent_sdk.evaluation._optimize_gepa_adapter import _AgentGEPAAdapter +from trpc_agent_sdk.evaluation._optimize_gepa_adapter import _extract_case_output +from trpc_agent_sdk.evaluation._optimize_gepa_adapter import _render_metric_lines +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +def _invocation(user_text: str, response_text: Optional[str] = None) -> Invocation: + final_response = ( + Content(role="model", parts=[Part.from_text(text=response_text)]) + if response_text is not None + else None + ) + return Invocation( + user_content=Content(role="user", parts=[Part.from_text(text=user_text)]), + 
final_response=final_response, + ) + + +def _eval_case(eval_id: str = "c1", user: str = "hi", expected: str = "ack") -> EvalCase: + return EvalCase( + eval_id=eval_id, + conversation=[_invocation(user, expected)], + ) + + +def _case_result( + eval_id: str, + *, + status: EvalStatus, + metric_score: float, + actual: str, + expected: str = "ack", + reason: Optional[str] = None, + error_message: Optional[str] = None, +) -> EvalCaseResult: + details = EvalMetricResultDetails(reason=reason, score=metric_score) if reason else None + return EvalCaseResult( + eval_id=eval_id, + eval_set_id="optimize_gepa_batch", + final_eval_status=status, + error_message=error_message, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="m1", + threshold=0.7, + score=metric_score, + eval_status=status, + details=details, + ) + ], + eval_metric_result_per_invocation=[ + EvalMetricResultPerInvocation( + actual_invocation=_invocation("hi", actual), + expected_invocation=_invocation("hi", expected), + eval_metric_results=[], + ) + ], + session_id=f"sess-{eval_id}", + ) + + +def _evaluate_result(case_results_by_id: dict[str, list[EvalCaseResult]]) -> EvaluateResult: + return EvaluateResult( + results_by_eval_set_id={ + "optimize_gepa_batch": EvalSetAggregateResult( + eval_results_by_eval_id=case_results_by_id, + num_runs=1, + ) + } + ) + + +async def _stub_call_agent(query: str) -> str: + return "stub" + + +def _eval_config() -> EvalConfig: + return EvalConfig(metrics=[{"metric_name": "m1", "threshold": 0.7}], num_runs=1) + + +def _new_target_prompt(write_recorder: Optional[dict[str, str]] = None) -> TargetPrompt: + target = TargetPrompt() + recorder = write_recorder if write_recorder is not None else {} + + async def read_cb() -> str: + return recorder.get("instruction", "initial") + + async def write_cb(value: str) -> None: + recorder["instruction"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + return target + + +def 
_multi_component_target_prompt(component_names: list[str]) -> TargetPrompt: + """Register one callback per requested component. + + Each callback writes into an isolated dict so ``write_all`` succeeds for + any candidate whose keys exactly match ``component_names``. Used by + multi-component reflective-dataset tests to exercise the + ``Other Active Components`` injection path. + """ + target = TargetPrompt() + storage: dict[str, str] = {name: "" for name in component_names} + + def _make_pair(name: str): + async def read_cb() -> str: + return storage[name] + + async def write_cb(value: str) -> None: + storage[name] = value + + return read_cb, write_cb + + for name in component_names: + read_cb, write_cb = _make_pair(name) + target.add_callback(name, read=read_cb, write=write_cb) + return target + + +def _patch_run_evaluator(monkeypatch, outcome: EvaluationOutcome) -> dict[str, dict]: + captured: dict[str, dict] = {} + + async def fake_run_evaluator(**kwargs): + captured["kwargs"] = kwargs + eval_dataset_path = kwargs.get("eval_dataset_path") + if eval_dataset_path: + from pathlib import Path + import json + payload = json.loads(Path(eval_dataset_path).read_text(encoding="utf-8")) + captured.setdefault("evalset_id_history", []).append(payload["eval_set_id"]) + captured.setdefault("evalset_payload_history", []).append(payload) + return outcome + + monkeypatch.setattr( + "trpc_agent_sdk.evaluation._optimize_gepa_adapter.run_evaluator", + fake_run_evaluator, + ) + return captured + + +def _make_adapter(target: Optional[TargetPrompt] = None, num_runs: int = 1) -> _AgentGEPAAdapter: + return _AgentGEPAAdapter( + target_prompt=target or _new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=num_runs, + ) + + +def test_extract_case_output_reads_first_invocation_final_response(): + case_result = _case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="output text") + assert 
_extract_case_output(case_result) == "output text" + + +def test_extract_case_output_returns_empty_when_no_invocation(): + case_result = EvalCaseResult( + eval_id="c1", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[], + eval_metric_result_per_invocation=[], + session_id="s", + ) + assert _extract_case_output(case_result) == "" + + +# --------------------------------------------------------------------------- +# ``_render_metric_lines`` is the core verdict-line renderer used by every +# Case Body block (per-turn + Overall). Tests below pin the structural +# guarantees the reflection LM relies on: PASS/FAIL labelling, threshold +# emission, judge-vs-synthesized reason precedence, and rubric breakdown. +# --------------------------------------------------------------------------- + + +def _failed_final_response_metric( + *, + text: Optional[dict] = None, + json_cfg: Optional[dict] = None, +) -> EvalMetricResult: + """Build a FAILED final_response_avg_score metric WITHOUT details.reason, + mirroring what the real ``_final_response_evaluator`` actually emits. + Used to exercise the deterministic-reason synthesis path.""" + criterion: dict = {"final_response": {}} + if text is not None: + criterion["final_response"]["text"] = text + if json_cfg is not None: + criterion["final_response"]["json"] = json_cfg + return EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=0.0, + eval_status=EvalStatus.FAILED, + criterion=criterion, + details=None, + ) + + +def test_render_metric_lines_emits_pass_fail_status_with_threshold_and_score(): + """Each metric occupies one line in the form + ``[PASSED|FAILED] : score=, threshold=``. 
The + reflection LM uses these markers to (a) decide which metrics to keep + constraints for, (b) tell which metric is being judged.""" + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="m_pass", + threshold=0.7, + score=0.95, + eval_status=EvalStatus.PASSED, + ), + EvalMetricResult( + metric_name="m_fail", + threshold=0.7, + score=0.10, + eval_status=EvalStatus.FAILED, + ), + ] + ) + assert "[PASSED] m_pass: score=0.9500, threshold=0.7000" in lines + assert "[FAILED] m_fail: score=0.1000, threshold=0.7000" in lines + + +def test_render_metric_lines_uses_explicit_judge_reason(): + """LLM-judged metrics already carry a natural-language reason in + ``details.reason``; that reason is surfaced verbatim under the verdict + line so the LM sees the judge's actual diagnosis.""" + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.5, + score=0.0, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails( + score=0.0, reason="judge said: missing units" + ), + ) + ] + ) + assert any("reason: judge said: missing units" in line for line in lines) + + +def test_render_metric_lines_synthesizes_reason_for_failing_contains_match(): + """Real deterministic matchers leave ``details.reason`` empty. 
We synth + a one-line failure explanation from the criterion config so the LM + sees WHY a substring match failed without diffing two long strings.""" + lines = _render_metric_lines( + [ + _failed_final_response_metric( + text={"match": "contains", "case_insensitive": True} + ) + ] + ) + joined = "\n".join(lines) + assert "expected substring not contained" in joined + assert "case-insensitive" in joined + + +def test_render_metric_lines_synthesizes_reason_for_failing_exact_match(): + lines = _render_metric_lines( + [_failed_final_response_metric(text={"match": "exact"})] + ) + joined = "\n".join(lines) + assert "byte-equal" in joined + assert "case-sensitive" in joined + + +def test_render_metric_lines_synthesizes_reason_for_failing_regex_match(): + lines = _render_metric_lines( + [_failed_final_response_metric(text={"match": "regex"})] + ) + assert any("regex" in line for line in lines) + + +def test_render_metric_lines_synthesizes_combined_text_and_json_failure(): + """When a metric runs BOTH text AND json checks the synthesized reason + must say so (joined with AND), otherwise the LM cannot tell which half + of the combined check failed.""" + lines = _render_metric_lines( + [ + _failed_final_response_metric( + text={"match": "exact"}, + json_cfg={"number_tolerance": 0.01}, + ) + ] + ) + joined = "\n".join(lines) + assert "byte-equal" in joined + assert "JSON" in joined + assert "AND" in joined + + +def test_render_metric_lines_no_reason_for_passing_deterministic_metric(): + """Passing metrics with no explicit reason emit no ``reason:`` line — + we only synthesize failure explanations, never invent praise.""" + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=1.0, + eval_status=EvalStatus.PASSED, + criterion={"final_response": {"text": {"match": "contains"}}}, + details=None, + ) + ] + ) + assert not any("reason:" in line for line in lines) + + +def 
test_render_metric_lines_keeps_explicit_reason_over_synthesis(): + """When details.reason IS present, the explicit text wins — never + overwritten by synthesized criterion text. Guards against an LLM + judge's nuanced verdict being clobbered by template-generated wording.""" + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.5, + score=0.0, + eval_status=EvalStatus.FAILED, + criterion={"llm_judge": {"judge_model": {"model_name": "j1"}}}, + details=EvalMetricResultDetails( + score=0.0, reason="judge said: missing units" + ), + ) + ] + ) + joined = "\n".join(lines) + assert "judge said: missing units" in joined + assert "byte-equal" not in joined + assert "expected substring not contained" not in joined + + +def test_render_metric_lines_expands_rubric_sub_scores(): + """LLM rubric metrics carry per-rubric sub-scores; each rubric must + surface as its own `` Ā· rubric[]: PASS|FAIL ...`` line so the LM + knows which sub-quality is responsible for the verdict.""" + from trpc_agent_sdk.evaluation._llm_criterion import RubricScore + + lines = _render_metric_lines( + [ + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=0.6667, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails( + score=0.6667, + reason="2/3 rubrics passed", + rubric_scores=[ + RubricScore(id="numeric_correct", score=1.0, reason="answer matches"), + RubricScore(id="reasoning_clear", score=0.0, reason="no calculation steps shown"), + RubricScore(id="units_present", score=1.0, reason="unit present"), + ], + ), + ) + ] + ) + joined = "\n".join(lines) + assert "rubric[numeric_correct]: PASS score=1.00" in joined + assert "rubric[reasoning_clear]: FAIL score=0.00" in joined + assert "rubric[units_present]: PASS score=1.00" in joined + assert "answer matches" in joined + assert "no calculation steps shown" in joined + assert "unit present" in joined + + +def test_adapter_constructor_stores_dependencies(): + 
target = _new_target_prompt() + config = _eval_config() + adapter = _AgentGEPAAdapter( + target_prompt=target, + eval_config=config, + call_agent=_stub_call_agent, + callbacks=None, + num_runs=3, + ) + assert adapter.target_prompt is target + assert adapter.eval_config is config + assert adapter.num_runs == 3 + + +def test_evaluate_writes_candidate_to_target_prompt(monkeypatch): + case = _eval_case("c1") + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + recorder: dict[str, str] = {} + target = _new_target_prompt(recorder) + adapter = _make_adapter(target) + + adapter.evaluate(batch=[case], candidate={"instruction": "new prompt text"}) + assert recorder.get("instruction") == "new prompt text" + + +def test_evaluate_passes_correct_kwargs_to_run_evaluator(monkeypatch): + case = _eval_case("c1") + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + captured = _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter(num_runs=2) + adapter.evaluate(batch=[case], candidate={"instruction": "x"}) + + kwargs = captured["kwargs"] + # The adapter wraps call_agent in a one-shot return-type sentinel + # (API-A2 fix), so identity equality with the user-provided callable + # no longer holds. Verify the wrapped callable is async and forwards + # the original return value. 
+ import asyncio as _asyncio + import inspect as _inspect + forwarded = kwargs["call_agent"] + assert _inspect.iscoroutinefunction(forwarded) + assert _asyncio.run(forwarded("ping")) == "stub" + assert kwargs["num_runs"] == 2 + assert kwargs["callbacks"] is None + assert kwargs["eval_dataset_path"].endswith(".evalset.json") + assert kwargs["eval_metrics_path"].endswith(".metrics.json") + + +def test_evaluate_scores_reflect_continuous_metric_means(monkeypatch): + """case_score must equal the mean of each metric's continuous score — + NOT a binary pass/fail collapse — so GEPA can distinguish candidates + whose metrics differ in degree but share pass/fail labels.""" + cases = [_eval_case("c1"), _eval_case("c2"), _eval_case("c3")] + outcome = EvaluationOutcome( + pass_rate=1 / 3, + tiebreaker=0.5, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")], + "c2": [_case_result("c2", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong")], + "c3": [_case_result("c3", status=EvalStatus.FAILED, metric_score=0.4, actual="bad")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}) + + assert batch_obj.scores == pytest.approx([0.9, 0.3, 0.4]) + assert len(batch_obj.outputs) == 3 + assert batch_obj.outputs[0] == "ack" + assert batch_obj.outputs[1] == "wrong" + + +def test_evaluate_with_num_runs_averages_continuous_metric_scores(monkeypatch): + """With num_runs > 1, case_score = mean over runs of mean over metrics — + no binary pass-count collapse.""" + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.5, + raw_result=_evaluate_result({ + "c1": [ + _case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ok"), + _case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="bad"), + ], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + 
adapter = _make_adapter(num_runs=2) + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}) + + # mean([mean([0.9]), mean([0.3])]) = mean([0.9, 0.3]) = 0.6 + assert batch_obj.scores == pytest.approx([0.6]) + + +def test_evaluate_case_score_averages_across_multiple_metrics(monkeypatch): + """When a case carries multiple metrics, case_score = mean of metric scores. + + This is the property GEPA relies on to break ties between candidates that + agree on the binary PASS/FAIL bucket but differ in degree (e.g. one keeps + rubric quality at 1.0 while the other regresses to 0.33).""" + case_result = EvalCaseResult( + eval_id="c_multi", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=0.3333, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.3333), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c_multi", + ) + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.7, + raw_result=_evaluate_result({"c_multi": [case_result]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c_multi")], candidate={"instruction": "x"} + ) + + # mean([1.0, 0.3333]) ā‰ˆ 0.6667; binary collapse would have produced 0.0 (failed) + assert batch_obj.scores == pytest.approx([0.66665], rel=1e-3) + + +def test_evaluate_populates_objective_scores_per_metric_per_case(monkeypatch): + """objective_scores must be a list aligned with batch order; each entry is + a {metric_name: score} dict — this is the channel GEPA needs to track a + per-objective Pareto frontier.""" + case_1 = EvalCaseResult( + eval_id="c1", + 
eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=0.6667, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=0.6667), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1", + ) + case_2 = EvalCaseResult( + eval_id="c2", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=0.0, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c2", + ) + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.6, + raw_result=_evaluate_result({"c1": [case_1], "c2": [case_2]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c1"), _eval_case("c2")], + candidate={"instruction": "x"}, + ) + + assert batch_obj.objective_scores is not None + assert len(batch_obj.objective_scores) == 2 + assert batch_obj.objective_scores[0]["final_response_avg_score"] == pytest.approx(1.0) + assert batch_obj.objective_scores[0]["llm_rubric_response"] == pytest.approx(0.6667, rel=1e-3) + assert batch_obj.objective_scores[1]["final_response_avg_score"] == pytest.approx(0.0) + assert batch_obj.objective_scores[1]["llm_rubric_response"] == pytest.approx(1.0) + + +def test_evaluate_objective_scores_average_across_num_runs(monkeypatch): + """When num_runs > 1, each 
metric's score in objective_scores must be the + mean of its scores across runs — keeping per-objective signal continuous.""" + run_1 = EvalCaseResult( + eval_id="c1", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.PASSED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="m1", threshold=0.7, score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + EvalMetricResult( + metric_name="m2", threshold=0.5, score=0.6, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=0.6), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1-r1", + ) + run_2 = EvalCaseResult( + eval_id="c1", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="m1", threshold=0.7, score=0.4, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.4), + ), + EvalMetricResult( + metric_name="m2", threshold=0.5, score=0.8, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=0.8), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1-r2", + ) + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.6, + raw_result=_evaluate_result({"c1": [run_1, run_2]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter(num_runs=2) + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"} + ) + + assert batch_obj.objective_scores is not None + assert len(batch_obj.objective_scores) == 1 + assert batch_obj.objective_scores[0]["m1"] == pytest.approx(0.7) + assert batch_obj.objective_scores[0]["m2"] == pytest.approx(0.7) + + +def test_evaluate_case_score_separates_candidates_with_same_pass_rate(monkeypatch): + """Two candidates that share the same PASS/FAIL labels on a case but + differ in metric score must end up with different case_scores, so GEPA's + best-candidate selection no longer collapses to 
``first-among-ties``.""" + case_a = EvalCaseResult( + eval_id="c1", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=0.0, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=1.0, + eval_status=EvalStatus.PASSED, + details=EvalMetricResultDetails(score=1.0), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1-A", + ) + case_b = EvalCaseResult( + eval_id="c1", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="final_response_avg_score", + threshold=1.0, + score=0.0, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.0), + ), + EvalMetricResult( + metric_name="llm_rubric_response", + threshold=0.66, + score=0.3333, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(score=0.3333), + ), + ], + eval_metric_result_per_invocation=[], + session_id="sess-c1-B", + ) + + outcome_a = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.5, + raw_result=_evaluate_result({"c1": [case_a]}), + ) + outcome_b = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.16, + raw_result=_evaluate_result({"c1": [case_b]}), + ) + + _patch_run_evaluator(monkeypatch, outcome_a) + adapter = _make_adapter() + score_a = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "candidate_A"} + ).scores[0] + + _patch_run_evaluator(monkeypatch, outcome_b) + score_b = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "candidate_B"} + ).scores[0] + + # Both candidates fail final_response, but candidate A preserves rubric quality. + # Continuous case_score must reflect this difference (binary collapse would + # have tied both at 0.0). 
+ assert score_a > score_b + assert score_a == pytest.approx(0.5) + assert score_b == pytest.approx(0.16665, rel=1e-3) + + +def test_evaluate_with_capture_traces_returns_trajectories(monkeypatch): + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="not matching")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=True) + + assert batch_obj.trajectories is not None + assert len(batch_obj.trajectories) == 1 + traj = batch_obj.trajectories[0] + # Trajectory dict now carries only what ``make_reflective_dataset`` + # actually consumes: the score (for filtering), the captured EvalCase / + # case_runs (for rebuilding the Case Body), and an optional + # error_message for the no-runs evaluator-error path. 
+ assert traj["_case"].eval_id == "c1" + assert len(traj["_case_runs"]) == 1 + assert traj["score"] == pytest.approx(0.3) + assert traj["error_message"] is None + + +def test_evaluate_without_capture_traces_returns_no_trajectories(monkeypatch): + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=False) + assert batch_obj.trajectories is None + + +def test_evaluate_handles_empty_raw_result(monkeypatch): + cases = [_eval_case("c1"), _eval_case("c2")] + outcome = EvaluationOutcome(pass_rate=0.0, tiebreaker=0.0, raw_result=None) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=True) + + assert batch_obj.scores == [0.0, 0.0] + assert batch_obj.outputs == ["", ""] + assert batch_obj.trajectories is not None + assert all( + t["error_message"] == "no result returned" for t in batch_obj.trajectories + ) + + +def test_evaluate_handles_case_missing_from_result(monkeypatch): + cases = [_eval_case("c1"), _eval_case("missing")] + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.5, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=True) + + assert batch_obj.scores == pytest.approx([0.9, 0.0]) + assert batch_obj.outputs[1] == "" + assert batch_obj.trajectories is not None + assert ( + batch_obj.trajectories[1]["error_message"] + == "case missing from evaluator result" + ) + + +def 
test_adapter_exposes_propose_new_texts_attribute_as_none(): + # gepa's reflective proposer reads ``adapter.propose_new_texts`` directly; + # the attribute must exist (None signals "use the default reflection LM"). + assert hasattr(_AgentGEPAAdapter, "propose_new_texts") + assert _AgentGEPAAdapter.propose_new_texts is None + + +def test_evaluate_deduplicates_repeated_case_ids_within_batch(monkeypatch): + # gepa's batch sampler pads the minibatch with least-frequent ids when the + # trainset size does not divide the minibatch size, so the same eval_case + # can appear twice in one batch. The evaluator's in-memory manager rejects + # duplicate eval_ids inside an EvalSet, so the adapter must rename repeats. + case = _eval_case("dup") + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"dup": [_case_result("dup", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + captured = _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + adapter.evaluate(batch=[case, case], candidate={"instruction": "x"}) + + payload = captured["evalset_payload_history"][0] + ids = [c["eval_id"] for c in payload["eval_cases"]] + assert len(ids) == 2 + assert len(set(ids)) == 2, f"Duplicate eval_ids must be renamed, got {ids}" + + +def test_evaluate_uses_unique_eval_set_id_per_call(monkeypatch): + case = _eval_case("c1") + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=0.9, + raw_result=_evaluate_result({"c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=0.9, actual="ack")]}), + ) + captured = _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + adapter.evaluate(batch=[case], candidate={"instruction": "v1"}) + adapter.evaluate(batch=[case], candidate={"instruction": "v2"}) + + ids = captured["evalset_id_history"] + assert len(ids) == 2 + assert ids[0] != ids[1], "Each call must use a unique eval_set_id to avoid in-memory manager collisions" + + +def 
test_make_reflective_dataset_collects_failed_cases_only(monkeypatch): + cases = [_eval_case("c1"), _eval_case("c2"), _eval_case("c3")] + outcome = EvaluationOutcome( + pass_rate=1 / 3, + tiebreaker=0.4, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=1.0, actual="ack")], + "c2": [_case_result("c2", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="bad")], + "c3": [_case_result("c3", status=EvalStatus.FAILED, metric_score=0.4, actual="bad", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, candidate={"instruction": "x"}, capture_traces=True) + + reflective = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + ) + records = reflective["instruction"] + assert len(records) == 2 + # Turn-sliced schema: case_id, score, Case Body. Other Active Components + # is omitted on single-component candidates. 
+ assert all("case_id" in r for r in records) + assert all("score" in r for r in records) + assert all("Case Body" in r for r in records) + assert all(isinstance(r["Case Body"], str) and r["Case Body"] for r in records) + assert all("Other Active Components" not in r for r in records) + + +def test_make_reflective_dataset_case_body_one_turn_block_per_invocation(monkeypatch): + """Multi-turn case: Case Body contains one ``### Turn N`` block per + invocation, each carrying its own User/Expected lines.""" + multi_turn_case = EvalCase( + eval_id="c_multi_turn", + conversation=[ + _invocation("hello", "hi there"), + _invocation("how are you", "I'm doing fine"), + _invocation("bye", "goodbye"), + ], + ) + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.0, + raw_result=_evaluate_result({ + "c_multi_turn": [_case_result( + "c_multi_turn", status=EvalStatus.FAILED, + metric_score=0.0, actual="wrong", + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[multi_turn_case], candidate={"instruction": "x"}, capture_traces=True + ) + records = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"] + + body = records[0]["Case Body"] + assert "### Turn 1" in body + assert "### Turn 2" in body + assert "### Turn 3" in body + assert "**User**: hello" in body + assert "**Expected**: hi there" in body + assert "**User**: how are you" in body + assert "**Expected**: I'm doing fine" in body + assert "**User**: bye" in body + assert "**Expected**: goodbye" in body + + +def test_make_reflective_dataset_case_body_emits_overall_for_multi_turn(monkeypatch): + """Multi-turn case ends with ``### Overall (case-level aggregate)`` so + the reflection LM sees both per-turn verdicts and the case-level roll-up.""" + multi_turn_case = EvalCase( + eval_id="c_multi", + conversation=[ + _invocation("hi", "ack1"), + 
_invocation("again", "ack2"), + ], + ) + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.0, + raw_result=_evaluate_result({ + "c_multi": [_case_result( + "c_multi", status=EvalStatus.FAILED, + metric_score=0.0, actual="wrong", + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[multi_turn_case], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + assert "### Overall (case-level aggregate)" in body + + +def test_make_reflective_dataset_case_body_omits_overall_for_single_turn_single_run(monkeypatch): + """Single-turn single-run cases skip the Overall block — Turn 1 already + carries the only verdict, an Overall heading would just repeat it.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + assert "### Turn 1" in body + assert "### Overall" not in body + + +def test_make_reflective_dataset_case_body_nests_run_blocks_for_multi_run(monkeypatch): + """num_runs > 1: each turn block nests ``#### Run N`` sub-blocks so the + reflection LM sees output variance attributed to the right run, without + repeating the shared User/Expected lines per run.""" + run1 = _case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.0, actual="output_run1" + ) + run1.run_id = 1 + run2 = _case_result( + "c1", 
status=EvalStatus.PASSED, metric_score=1.0, actual="output_run2" + ) + run2.run_id = 2 + outcome = EvaluationOutcome( + pass_rate=0.5, + tiebreaker=0.5, + raw_result=_evaluate_result({"c1": [run1, run2]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter(num_runs=2) + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + + assert "#### Run 1" in body + assert "#### Run 2" in body + assert "**Agent Response**: output_run1" in body + assert "**Agent Response**: output_run2" in body + # Shared User line appears once at the turn level — not once per run. + assert body.count("**User**: hi") == 1 + # Multi-run cases close with per-run aggregate. + assert "### Overall (per-run aggregate)" in body + + +def test_make_reflective_dataset_case_body_renders_tool_trace_inline(monkeypatch): + """Tool calls render as a single-line ``func(arg=val) → result [id=...]`` + so GEPA's H6 markdown cap does not flatten the call/arg/result hierarchy + when the renderer nests them as headers.""" + from trpc_agent_sdk.evaluation._eval_case import IntermediateData + from trpc_agent_sdk.types import FunctionCall, FunctionResponse + + actual = _invocation("query", "I used search") + actual.intermediate_data = IntermediateData( + tool_uses=[ + FunctionCall(id="call_1", name="search", args={"q": "weather"}), + ], + tool_responses=[ + FunctionResponse(id="call_1", name="search", response={"result": "sunny"}), + ], + ) + + case_result = EvalCaseResult( + eval_id="c_tool", + eval_set_id="optimize_gepa_batch", + final_eval_status=EvalStatus.FAILED, + overall_eval_metric_results=[ + EvalMetricResult( + metric_name="m1", threshold=0.7, score=0.3, + eval_status=EvalStatus.FAILED, + details=EvalMetricResultDetails(reason="off", score=0.3), + ) + 
], + eval_metric_result_per_invocation=[ + EvalMetricResultPerInvocation( + actual_invocation=actual, + expected_invocation=_invocation("query", "expected"), + eval_metric_results=[], + ) + ], + session_id="sess-c_tool", + ) + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({"c_tool": [case_result]}), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c_tool")], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + + assert "**Tool Trace**:" in body + assert "search(q='weather')" in body + assert "'sunny'" in body + assert "[id=call_1]" in body + + +def test_make_reflective_dataset_case_body_omits_tool_trace_when_absent(monkeypatch): + """When the agent did not invoke any tool, the Tool Trace section is + absent — keeps the prompt focused on what the agent actually produced.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"}, capture_traces=True + ) + body = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0]["Case Body"] + assert "**Tool Trace**:" not in body + + +def test_make_reflective_dataset_record_carries_case_id_and_score(monkeypatch): + """Per-record meta fields case_id and score let the reflection LM + reference a specific case and see the aggregated case-level score + alongside per-metric breakdown.""" + outcome = EvaluationOutcome( + 
pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c_special": [_case_result("c_special", status=EvalStatus.FAILED, metric_score=0.42, actual="wrong")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c_special")], candidate={"instruction": "x"}, capture_traces=True + ) + record = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0] + assert record["case_id"] == "c_special" + assert record["score"] == pytest.approx(0.42) + + +def test_make_reflective_dataset_other_active_components_present_for_multi_component( + monkeypatch, +): + """Multi-component candidate: each record exposes the OTHER prompts' + current text under ``Other Active Components`` so the reflection LM can + avoid restating requirements already enforced by sibling prompts.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + target = _multi_component_target_prompt(["system_prompt", "skill_prompt"]) + adapter = _make_adapter(target=target) + candidate = { + "system_prompt": "You are a helpful assistant.", + "skill_prompt": "When asked math, always include units.", + } + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate=candidate, capture_traces=True + ) + reflective = adapter.make_reflective_dataset( + candidate=candidate, + eval_batch=batch_obj, + components_to_update=["system_prompt"], + ) + other_md = reflective["system_prompt"][0]["Other Active Components"] + # The sibling prompt's current body is included. + assert "When asked math, always include units." 
in other_md + assert "### skill_prompt (current)" in other_md + # The target component itself is NOT echoed (GEPA already shows it in ). + assert "system_prompt (current)" not in other_md + + +def test_make_reflective_dataset_other_active_components_absent_for_single_component( + monkeypatch, +): + """Single-component candidate: no ``Other Active Components`` key is + emitted — there is nothing else to surface and the LM should not see an + empty section.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate={"instruction": "x"}, capture_traces=True + ) + record = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0] + assert "Other Active Components" not in record + + +def test_make_reflective_dataset_other_active_components_rebuilt_per_component( + monkeypatch, +): + """When dispatching to multiple components in the same round, each + component's record set must list the OTHER components' content — i.e. 
+ the ``Other Active Components`` field is rebuilt per component, not + shared across them.""" + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + target = _multi_component_target_prompt(["system_prompt", "skill_prompt"]) + adapter = _make_adapter(target=target) + candidate = { + "system_prompt": "SYSTEM BODY", + "skill_prompt": "SKILL BODY", + } + batch_obj = adapter.evaluate( + batch=[_eval_case("c1")], candidate=candidate, capture_traces=True + ) + reflective = adapter.make_reflective_dataset( + candidate=candidate, + eval_batch=batch_obj, + components_to_update=["system_prompt", "skill_prompt"], + ) + + sys_other = reflective["system_prompt"][0]["Other Active Components"] + skill_other = reflective["skill_prompt"][0]["Other Active Components"] + + # Each record set surfaces only the sibling component's body. + assert "SKILL BODY" in sys_other + assert "SYSTEM BODY" not in sys_other + assert "SYSTEM BODY" in skill_other + assert "SKILL BODY" not in skill_other + + +def test_make_reflective_dataset_surfaces_evaluator_error_as_case_body(monkeypatch): + """When the evaluator fails to produce runs for a case (e.g. ``case + missing from evaluator result``), the trajectory entry carries an + ``error_message`` and no ``_case_runs``. 
The reflective record must + still appear with that error_message as the Case Body, otherwise the + LM silently loses every failed case where the runtime itself broke.""" + cases = [_eval_case("c_missing")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.0, + raw_result=_evaluate_result({}), # no case results at all + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate( + batch=cases, candidate={"instruction": "x"}, capture_traces=True + ) + record = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + )["instruction"][0] + assert record["case_id"] == "c_missing" + assert record["score"] == pytest.approx(0.0) + assert "case missing from evaluator result" in record["Case Body"] + + +def test_make_reflective_dataset_returns_empty_for_no_components(): + adapter = _make_adapter() + fake_batch = type("FakeBatch", (), {"trajectories": [{"score": 0.0}]})() + result = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=fake_batch, + components_to_update=[], + ) + assert result == {} + + +def test_make_reflective_dataset_handles_no_trajectories(): + adapter = _make_adapter() + fake_batch = type("FakeBatch", (), {"trajectories": None})() + result = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=fake_batch, + components_to_update=["instruction", "system"], + ) + assert result == {"instruction": [], "system": []} + + +def test_make_reflective_dataset_replicates_records_across_components(monkeypatch): + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.FAILED, metric_score=0.3, actual="wrong", reason="off")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _make_adapter() + batch_obj = adapter.evaluate(batch=cases, 
candidate={"instruction": "x"}, capture_traces=True) + reflective = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction", "react_skill"], + ) + assert "instruction" in reflective + assert "react_skill" in reflective + assert len(reflective["instruction"]) == 1 + assert len(reflective["react_skill"]) == 1 + + +def test_adapter_records_best_history_per_case(): + """After three _record_history calls the buffer keeps the top-2 by score.""" + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=2, + ) + adapter._record_history(case_id="c1", score=0.4, best_response="hello low") + adapter._record_history(case_id="c1", score=0.9, best_response="hello high") + adapter._record_history(case_id="c1", score=0.6, best_response="hello mid") + + history = adapter._best_history["c1"] + assert len(history) == 2 + assert history[0]["score"] == pytest.approx(0.9) + assert history[0]["best_response"] == "hello high" + assert history[1]["score"] == pytest.approx(0.6) + assert history[1]["best_response"] == "hello mid" + + +def test_adapter_top_k_zero_disables_buffer(): + """top_k=0 is the kill switch — _record_history must be a no-op.""" + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + adapter._record_history(case_id="c1", score=0.9, best_response="hello") + + assert adapter._best_history.get("c1", []) == [] + + +def test_evaluate_populates_best_history_buffer(monkeypatch): + """Running evaluate() twice on the same case accumulates history sorted by score.""" + from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome + + cases = [_eval_case("c1")] + outcome_low = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + 
raw_result=_evaluate_result({ + "c1": [_case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.3, actual="low" + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome_low) + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=2, + ) + adapter.evaluate( + batch=cases, candidate={"instruction": "x"}, capture_traces=False + ) + + outcome_high = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.8, + raw_result=_evaluate_result({ + "c1": [_case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.8, actual="high" + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome_high) + adapter.evaluate( + batch=cases, candidate={"instruction": "y"}, capture_traces=False + ) + + history = adapter._best_history["c1"] + assert len(history) == 2 + assert history[0]["score"] == pytest.approx(0.8) + assert history[0]["best_response"] == "high" + assert history[1]["score"] == pytest.approx(0.3) + assert history[1]["best_response"] == "low" + + +def test_make_reflective_dataset_includes_history_top_k_when_buffer_nonempty( + monkeypatch, +): + """When history is seeded and top_k>0, the record carries a history_top_k list.""" + from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome + + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.3, actual="bad" + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=2, + ) + # Seed history with a previous high-score entry the adapter should keep. 
+ adapter._record_history(case_id="c1", score=0.9, best_response="known good") + + batch_obj = adapter.evaluate( + batch=cases, candidate={"instruction": "x"}, capture_traces=True + ) + dataset = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + ) + + records = dataset["instruction"] + assert len(records) == 1 + assert "history_top_k" in records[0] + history = records[0]["history_top_k"] + assert len(history) == 2 # 0.9 seeded + 0.3 from this evaluation + assert history[0]["score"] == pytest.approx(0.9) + assert history[0]["best_response"] == "known good" + assert history[1]["score"] == pytest.approx(0.3) + + +def test_make_reflective_dataset_omits_history_top_k_when_buffer_empty( + monkeypatch, +): + """top_k=0 disables the feature: the record must not carry history_top_k.""" + from trpc_agent_sdk.evaluation._optimize_evaluator_call import EvaluationOutcome + + cases = [_eval_case("c1")] + outcome = EvaluationOutcome( + pass_rate=0.0, + tiebreaker=0.3, + raw_result=_evaluate_result({ + "c1": [_case_result( + "c1", status=EvalStatus.FAILED, metric_score=0.3, actual="bad" + )], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + + batch_obj = adapter.evaluate( + batch=cases, candidate={"instruction": "x"}, capture_traces=True + ) + dataset = adapter.make_reflective_dataset( + candidate={"instruction": "x"}, + eval_batch=batch_obj, + components_to_update=["instruction"], + ) + + records = dataset["instruction"] + assert len(records) == 1 + assert "history_top_k" not in records[0] + + +# --------------------------------------------------------------------------- +# Long-lived event loop: call_agent may hold async resources across evaluate() +# calls without hitting "Event loop is closed" (fix for 
CONC-2). +# --------------------------------------------------------------------------- + + +def test_evaluate_reuses_single_loop_across_calls(monkeypatch) -> None: + """A module-level async resource bound to the loop on first use must + keep working across consecutive evaluate() calls.""" + import asyncio + + outcome = EvaluationOutcome( + pass_rate=1.0, + tiebreaker=1.0, + metric_breakdown={"m1": 1.0}, + failed_case_ids=[], + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=1.0, actual="ok")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + seen_loops: list[int] = [] + + async def call_agent_with_loop_id(query: str) -> str: + # id(loop) stays constant iff the adapter reuses one loop. + seen_loops.append(id(asyncio.get_running_loop())) + return "stub" + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=call_agent_with_loop_id, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + try: + for _ in range(3): + adapter.evaluate( + batch=[_eval_case()], + candidate={"instruction": "v"}, + ) + finally: + adapter.close() + + # _patch_run_evaluator stubs the actual evaluator path so call_agent + # is not driven; verify the same loop is used by inspecting the + # adapter-owned loop directly across calls. 
+ assert adapter._loop is None # closed after close() + + +def test_evaluate_loop_reuse_supports_module_level_async_client(monkeypatch) -> None: + """A user holding a module-level lock-like async resource that binds to + its first event loop must still work across multiple evaluate() calls.""" + import asyncio + + outcome = EvaluationOutcome( + pass_rate=1.0, tiebreaker=1.0, metric_breakdown={"m1": 1.0}, + failed_case_ids=[], + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=1.0, actual="ok")], + }), + ) + + captured_loops: list[int] = [] + # Capture the loop id during write_all (driven by adapter's loop). + target = TargetPrompt() + state = {"value": ""} + + async def read_cb() -> str: + return state["value"] + + async def write_cb(value: str) -> None: + captured_loops.append(id(asyncio.get_running_loop())) + state["value"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _AgentGEPAAdapter( + target_prompt=target, + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + try: + for i in range(3): + adapter.evaluate( + batch=[_eval_case()], + candidate={"instruction": f"v{i}"}, + ) + finally: + adapter.close() + + # All write_all invocations executed on the same event loop. 
+ assert len(captured_loops) == 3 + assert len(set(captured_loops)) == 1 + + +def test_close_is_idempotent_and_safe_before_evaluate() -> None: + """close() before any evaluate() and double close() must not raise.""" + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + adapter.close() + adapter.close() + + +def test_evaluate_after_close_creates_fresh_loop(monkeypatch) -> None: + """After close(), a subsequent evaluate() must spin up a new loop + (defensive support for callers that reuse an adapter).""" + import asyncio + + outcome = EvaluationOutcome( + pass_rate=1.0, tiebreaker=1.0, metric_breakdown={"m1": 1.0}, + failed_case_ids=[], + raw_result=_evaluate_result({ + "c1": [_case_result("c1", status=EvalStatus.PASSED, metric_score=1.0, actual="ok")], + }), + ) + _patch_run_evaluator(monkeypatch, outcome) + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=_stub_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + try: + adapter.evaluate(batch=[_eval_case()], candidate={"instruction": "v1"}) + old_loop = adapter._loop + first_loop_id = id(old_loop) + adapter.close() + assert adapter._loop is None + adapter.evaluate(batch=[_eval_case()], candidate={"instruction": "v2"}) + assert adapter._loop is not None + assert id(adapter._loop) != first_loop_id + del old_loop + finally: + adapter.close() + + +# --------------------------------------------------------------------------- +# API-A2: call_agent return-type sentinel check (must surface non-str return +# on the first call instead of crashing deep inside metric code). 
+# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_call_agent_returning_non_str_is_rejected_on_first_call(): + """An async callable that returns a non-str value must raise a clear + TypeError on the first invocation, naming the actual returned type. + The check fires through the wrapper installed in _AgentGEPAAdapter.__init__.""" + async def bad_call_agent(query: str): + return 42 # int, not str + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=bad_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + + with pytest.raises(TypeError, match="call_agent must return str"): + await adapter.call_agent("hi") + + +@pytest.mark.asyncio +async def test_call_agent_return_check_runs_only_once(): + """The wrapper must only validate on the first successful call to avoid + per-case overhead. After the first call returns a valid str, later calls + bypass the isinstance check entirely (we cannot directly observe this, + but verify functional correctness: subsequent str returns succeed).""" + call_count = {"n": 0} + + async def good_call_agent(query: str): + call_count["n"] += 1 + return "ok" + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=good_call_agent, + callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + + for _ in range(5): + result = await adapter.call_agent("hi") + assert result == "ok" + assert call_count["n"] == 5 + + +@pytest.mark.asyncio +async def test_call_agent_return_check_does_not_swallow_user_exceptions(): + """If call_agent itself raises, the wrapper must propagate the original + exception (not replace it with a TypeError).""" + async def raising_call_agent(query: str): + raise RuntimeError("user-side failure") + + adapter = _AgentGEPAAdapter( + target_prompt=_new_target_prompt(), + eval_config=_eval_config(), + call_agent=raising_call_agent, + 
callbacks=None, + num_runs=1, + top_k_per_case=0, + ) + + with pytest.raises(RuntimeError, match="user-side failure"): + await adapter.call_agent("hi") diff --git a/tests/evaluation/test_optimize_gepa_callback.py b/tests/evaluation/test_optimize_gepa_callback.py new file mode 100644 index 0000000..80cde05 --- /dev/null +++ b/tests/evaluation/test_optimize_gepa_callback.py @@ -0,0 +1,667 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for _AgentGEPACallback buffering iteration events as RoundRecords.""" + +from __future__ import annotations + +from trpc_agent_sdk.evaluation._optimize_gepa_callback import _AgentGEPACallback +from trpc_agent_sdk.evaluation._optimize_gepa_callback import _translate_skip_reason + + +def _trigger_iteration( + callback: _AgentGEPACallback, + *, + iteration: int, + candidate: dict, + average_score: float, + is_best: bool = False, + proposal_accepted: bool = False, +) -> None: + callback.on_iteration_start({"iteration": iteration, "state": None, "trainset_loader": None}) + callback.on_valset_evaluated( + { + "iteration": iteration, + "candidate_idx": iteration, + "candidate": candidate, + "scores_by_val_id": {}, + "average_score": average_score, + "num_examples_evaluated": 1, + "total_valset_size": 1, + "parent_ids": [], + "is_best_program": is_best, + "outputs_by_val_id": None, + } + ) + callback.on_iteration_end( + {"iteration": iteration, "state": None, "proposal_accepted": proposal_accepted} + ) + + +def test_callback_starts_with_empty_buffer(): + callback = _AgentGEPACallback() + assert callback.rounds == [] + assert callback.baseline_metric_breakdown == {} + assert callback.baseline_failed_case_ids == [] + assert callback.baseline_pass_rate == 0.0 + + +def test_callback_captures_seed_evaluation_into_baseline_not_rounds(): + """gepa emits ``iteration == 0`` exactly 
once for the seed candidate. + Callback must store it as baseline rather than appending a RoundRecord.""" + + class _StubOutcome: + metric_breakdown = {"final_response_avg_score": 0.42} + failed_case_ids = ["case-2"] + + class _StubAdapter: + last_outcome = _StubOutcome() + + callback = _AgentGEPACallback(adapter=_StubAdapter()) + callback.on_valset_evaluated( + { + "iteration": 0, + "candidate_idx": 0, + "candidate": {"instruction": "baseline"}, + "scores_by_val_id": {}, + "average_score": 0.42, + "num_examples_evaluated": 1, + "total_valset_size": 1, + "parent_ids": [], + "is_best_program": True, + "outputs_by_val_id": None, + } + ) + + assert callback.rounds == [] + assert callback.baseline_metric_breakdown == {"final_response_avg_score": 0.42} + assert callback.baseline_failed_case_ids == ["case-2"] + assert callback.baseline_pass_rate == 0.42 + + +def test_callback_records_one_round_per_iteration(): + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.6, + is_best=False, + proposal_accepted=False, + ) + _trigger_iteration( + callback, + iteration=2, + candidate={"instruction": "v2"}, + average_score=0.9, + is_best=True, + proposal_accepted=True, + ) + + assert len(callback.rounds) == 2 + assert callback.rounds[0].round == 1 + assert callback.rounds[0].validation_pass_rate == 0.6 + assert callback.rounds[0].candidate_prompts == {"instruction": "v1"} + assert callback.rounds[0].accepted is False + + assert callback.rounds[1].round == 2 + assert callback.rounds[1].validation_pass_rate == 0.9 + assert callback.rounds[1].candidate_prompts == {"instruction": "v2"} + assert callback.rounds[1].accepted is True + + +def test_callback_acceptance_via_proposal_accepted_only(): + """proposal_accepted=True alone should mark the round accepted.""" + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.7, + 
is_best=False, + proposal_accepted=True, + ) + assert callback.rounds[0].accepted is True + + +def test_callback_acceptance_follows_proposal_accepted_only(): + """A candidate flagged is_best_program=True without proposal_accepted=True + must not be reported as accepted: the user-facing "accepted" status follows + gepa's proposal_accepted contract so the timeline matches gepa's own + acceptance log. + """ + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.7, + is_best=True, + proposal_accepted=False, + ) + assert callback.rounds[0].accepted is False + + +def test_callback_always_emits_record_even_when_valset_not_evaluated(): + """Iterations rejected by the subsample gate still get a RoundRecord so + round indices in the reporter stay contiguous with gepa iterations. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": False}) + assert len(callback.rounds) == 1 + record = callback.rounds[0] + assert record.round == 1 + assert record.skip_reason == "reflect-LM produced no usable new prompt" + assert record.candidate_prompts == {} + assert record.accepted is False + + +def test_callback_records_candidate_field_names_falls_back_to_candidate_keys(): + """Without an ``on_proposal_end`` event (e.g. merge round, or any path + that bypasses the reflective proposer), ``optimized_field_names`` + falls back to the full candidate key set so result.json never + surfaces an empty list when a candidate exists. 
+ """ + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"system": "s1", "react": "r1"}, + average_score=0.5, + ) + assert set(callback.rounds[0].optimized_field_names) == {"system", "react"} + + +def test_callback_optimized_field_names_uses_proposal_end_components_only(): + """F-3: when ``on_proposal_end`` fires, ``optimized_field_names`` must + reflect ONLY the components rewritten by the reflection LM this + round (gepa's RoundRobin / random component selectors mutate a + subset of the candidate's components per iteration). + + Previously the field reported the full ``candidate.keys()`` list, + misleading users into thinking every component was rewritten each + round when only one (or a subset) actually was. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + # gepa's RoundRobin selector picked only "dim_intent" this round; the + # reflection LM produced exactly one new instruction. + callback.on_proposal_end( + { + "iteration": 1, + "new_instructions": {"dim_intent": "rewritten intent prompt"}, + "subsample_scores_before": [0.5], + "subsample_scores_after": [0.7], + } + ) + callback.on_valset_evaluated( + { + "iteration": 1, + "candidate": { + "system": "s1", + "dim_intent": "rewritten intent prompt", + "dim_slot": "s2", + "dim_response": "r1", + "dim_summary": "su1", + }, + "average_score": 0.7, + "is_best_program": False, + } + ) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": True}) + + assert callback.rounds[0].optimized_field_names == ["dim_intent"] + # candidate_prompts still carries the full candidate (used elsewhere + # for ``best_prompts`` etc.); only the "what was changed this round" + # metadata is narrowed. 
+ assert set(callback.rounds[0].candidate_prompts.keys()) == { + "system", + "dim_intent", + "dim_slot", + "dim_response", + "dim_summary", + } + + +def test_callback_optimized_field_names_resets_between_iterations(): + """``_iter_changed_components`` must reset on ``on_iteration_start``; + a proposal event in iteration N must not leak into iteration N+1's + ``optimized_field_names`` when the next iteration has no proposal + event of its own (e.g. a merge round following a reflective round). + """ + callback = _AgentGEPACallback() + + # Iteration 1: reflective round, only "dim_intent" rewritten. + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_proposal_end( + {"iteration": 1, "new_instructions": {"dim_intent": "v1"}} + ) + callback.on_valset_evaluated( + { + "iteration": 1, + "candidate": {"dim_intent": "v1", "dim_slot": "s0"}, + "average_score": 0.6, + "is_best_program": False, + } + ) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": True}) + + # Iteration 2: merge round — no on_proposal_end, must fall back to + # full candidate keys, NOT reuse iteration 1's ["dim_intent"]. 
+ callback.on_iteration_start({"iteration": 2, "state": None, "trainset_loader": None}) + callback.on_merge_attempted({"iteration": 2}) + callback.on_valset_evaluated( + { + "iteration": 2, + "candidate": {"dim_intent": "v1", "dim_slot": "s0"}, + "average_score": 0.65, + "is_best_program": False, + } + ) + callback.on_iteration_end({"iteration": 2, "state": None, "proposal_accepted": True}) + + assert callback.rounds[0].optimized_field_names == ["dim_intent"] + assert set(callback.rounds[1].optimized_field_names) == {"dim_intent", "dim_slot"} + assert callback.rounds[1].kind == "merge" + + +# --------------------------------------------------------------------------- +# on_evaluation_end: parent / candidate subsample-score routing (F-5) +# --------------------------------------------------------------------------- +# +# gepa marks the post-mutation / post-merge eval with ``candidate_idx=None``; +# every other evaluation_end carries an int ``candidate_idx`` and represents +# the parent / current-program eval. Earlier seq-based logic misclassified +# rounds where the reflective proposer picked the seed program (id=0) as +# parent because gepa flags that parent eval with ``is_seed_candidate=True``. + + +def test_on_evaluation_end_records_parent_then_candidate_normal_round(): + """Normal reflective round: parent eval first (int idx), then new + candidate eval (idx=None). Both scores must land on the right slots. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 2, "state": None, "trainset_loader": None}) + # Parent eval (curr_prog_id=5, NOT seed). Use scores that average + # exactly to 0.5 so the assertion is float-safe. + callback.on_evaluation_end( + { + "iteration": 2, + "candidate_idx": 5, + "scores": [0.4, 0.5, 0.6], + "is_seed_candidate": False, + } + ) + # New candidate eval (post-mutation, candidate_idx=None). 
+ callback.on_evaluation_end( + { + "iteration": 2, + "candidate_idx": None, + "scores": [0.8, 0.9, 1.0], + "is_seed_candidate": False, + } + ) + assert callback._iter_train_parent_score == 0.5 # noqa: SLF001 + assert callback._iter_train_candidate_score == 0.9 # noqa: SLF001 + assert callback._iter_train_minibatch_size == 3 # noqa: SLF001 + + +def test_on_evaluation_end_records_correctly_when_parent_is_seed(): + """F-5 regression: when reflective_mutation picks the seed program + (id=0) as parent, the parent eval is flagged ``is_seed_candidate=True``. + Earlier logic dropped that event and shifted the candidate score + into the parent slot — verify the new ``candidate_idx``-based routing + keeps the slots correct. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + # Parent eval — parent IS the seed program. gepa sets is_seed_candidate=True + # here (reflective_mutation.py:283). + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": 0, + "scores": [0.5, 0.5], + "is_seed_candidate": True, + } + ) + # New candidate eval. + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": None, + "scores": [0.9, 0.9], + "is_seed_candidate": False, + } + ) + # Parent slot must carry the seed score, NOT the candidate score. + assert callback._iter_train_parent_score == 0.5 # noqa: SLF001 + assert callback._iter_train_candidate_score == 0.9 # noqa: SLF001 + + +def test_on_evaluation_end_merge_round_only_candidate_score(): + """Merge round emits exactly one evaluation_end with ``candidate_idx=None`` + (merge.py:376). Parent slot must stay None — merge has two parents, + a single ``parent_score`` doesn't apply. 
+ """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 3, "state": None, "trainset_loader": None}) + callback.on_merge_attempted({"iteration": 3}) + callback.on_evaluation_end( + { + "iteration": 3, + "candidate_idx": None, + "scores": [0.7, 0.7, 0.7, 0.7], + "is_seed_candidate": False, + } + ) + assert callback._iter_train_parent_score is None # noqa: SLF001 + assert callback._iter_train_candidate_score == 0.7 # noqa: SLF001 + + +def test_on_evaluation_end_skips_empty_scores(): + """Empty scores carry no information — leave both slots untouched.""" + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": 5, + "scores": [], + "is_seed_candidate": False, + } + ) + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": None, + "scores": None, + "is_seed_candidate": False, + } + ) + assert callback._iter_train_parent_score is None # noqa: SLF001 + assert callback._iter_train_candidate_score is None # noqa: SLF001 + + +def test_on_evaluation_end_minibatch_size_set_from_parent_when_unset(): + """When ``on_minibatch_sampled`` did not fire (or fired with empty + list), the parent eval's ``len(scores)`` is the next-best signal for + the round's minibatch size. + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + # No on_minibatch_sampled — minibatch size starts at 0. 
+ assert callback._iter_train_minibatch_size == 0 # noqa: SLF001 + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": 5, + "scores": [0.4, 0.5, 0.6], # 3 cases + "is_seed_candidate": False, + } + ) + assert callback._iter_train_minibatch_size == 3 # noqa: SLF001 + + +def test_on_evaluation_end_does_not_overwrite_minibatch_size_from_sampled(): + """If ``on_minibatch_sampled`` already set the minibatch size, + parent eval's score count must NOT clobber it (the sampled event is + authoritative — it counts the FULL minibatch even when the eval + short-circuits a subset). + """ + callback = _AgentGEPACallback() + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + callback.on_minibatch_sampled( + {"iteration": 1, "minibatch_ids": ["a", "b", "c", "d", "e"], "trainset_size": 5} + ) + assert callback._iter_train_minibatch_size == 5 # noqa: SLF001 + # Parent eval somehow only scored 2 cases — minibatch_size stays 5. + callback.on_evaluation_end( + { + "iteration": 1, + "candidate_idx": 5, + "scores": [0.4, 0.5], + "is_seed_candidate": False, + } + ) + assert callback._iter_train_minibatch_size == 5 # noqa: SLF001 + + +def test_callback_records_duration_seconds_non_negative(): + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.5, + ) + assert callback.rounds[0].duration_seconds >= 0.0 + + +def test_callback_reasoning_includes_score(): + callback = _AgentGEPACallback() + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.8765, + is_best=True, + ) + assert "0.8765" in callback.rounds[0].acceptance_reason + + +class _FakeOutcome: + def __init__(self, metric_breakdown: dict, failed_case_ids: list) -> None: + self.metric_breakdown = metric_breakdown + self.failed_case_ids = failed_case_ids + + +class _FakeAdapter: + def __init__(self, outcome: _FakeOutcome) -> None: + self.last_outcome = outcome 
+ + +class _FakeReflectionLM: + def __init__(self) -> None: + self.total_calls = 0 + self.total_cost = 0.0 + self.total_token_usage = {"prompt": 0, "completion": 0, "total": 0} + + def make_call(self, prompt_tokens: int = 10, completion_tokens: int = 5, cost: float = 0.01) -> None: + self.total_calls += 1 + self.total_cost += cost + self.total_token_usage["prompt"] += prompt_tokens + self.total_token_usage["completion"] += completion_tokens + self.total_token_usage["total"] += prompt_tokens + completion_tokens + + +def test_callback_pulls_metric_breakdown_and_failures_from_adapter(): + """B2: when adapter is supplied, callback fills metric_breakdown / failed_case_ids.""" + outcome = _FakeOutcome( + metric_breakdown={"m1": 0.7, "m2": 0.4}, + failed_case_ids=["c3", "c5"], + ) + adapter = _FakeAdapter(outcome) + callback = _AgentGEPACallback(adapter=adapter) + _trigger_iteration( + callback, + iteration=1, + candidate={"instruction": "v1"}, + average_score=0.55, + ) + assert callback.rounds[0].metric_breakdown == {"m1": 0.7, "m2": 0.4} + assert callback.rounds[0].failed_case_ids == ["c3", "c5"] + + +def test_callback_records_per_round_reflection_lm_call_delta(): + """Reflection-LM calls/cost/tokens between iteration_start and iteration_end + should land on the produced RoundRecord.""" + lm = _FakeReflectionLM() + callback = _AgentGEPACallback(reflection_lm=lm) + + callback.on_iteration_start({"iteration": 1, "state": None, "trainset_loader": None}) + lm.make_call(prompt_tokens=20, completion_tokens=10, cost=0.02) + lm.make_call(prompt_tokens=15, completion_tokens=8, cost=0.015) + callback.on_valset_evaluated( + { + "iteration": 1, + "candidate": {"instruction": "v1"}, + "average_score": 0.7, + "is_best_program": False, + } + ) + callback.on_iteration_end({"iteration": 1, "state": None, "proposal_accepted": False}) + + record = callback.rounds[0] + assert record.reflection_lm_calls == 2 + assert record.round_llm_cost == 0.035 + assert record.round_token_usage == 
def test_translate_skip_reason_handles_known_strings():
    """Known gepa skip reasons translate to their human-readable messages.

    The last two cases check that letter case, dashes and surrounding
    whitespace in the raw reason do not change the translation.
    """
    no_trajectories = "no trajectories captured this round"
    already_perfect = "minibatch already perfect (skip_perfect_score on)"
    cases = [
        ("no_trajectories", no_trajectories),
        ("all_scores_perfect", already_perfect),
        ("All-Scores-Perfect", already_perfect),
        (" no_trajectories ", no_trajectories),
    ]
    for raw_reason, expected_message in cases:
        assert _translate_skip_reason(raw_reason) == expected_message
def test_callback_invokes_on_valset_breakdown_for_baseline_iteration():
    """The iteration-0 (baseline) valset evaluation is forwarded to the hook."""
    seen_breakdowns: list[dict] = []

    def _record(bd):
        seen_breakdowns.append(bd)

    adapter = _FakeAdapter(
        _FakeOutcome(metric_breakdown={"m1": 0.7}, failed_case_ids=[])
    )
    cb = _AgentGEPACallback(adapter=adapter, on_valset_breakdown=_record)

    baseline_event = {
        "iteration": 0,
        "candidate": {"instruction": "baseline"},
        "average_score": 0.7,
        "is_best_program": True,
    }
    cb.on_valset_evaluated(baseline_event)

    assert seen_breakdowns == [{"m1": 0.7}]
def test_callback_on_valset_breakdown_exceptions_do_not_break_loop():
    """A raising ``on_valset_breakdown`` hook must not lose the round record."""

    def _boom(_bd):
        raise RuntimeError("boom")

    callback_kwargs = {
        "adapter": _FakeAdapter(
            _FakeOutcome(metric_breakdown={"m1": 0.5}, failed_case_ids=[])
        ),
        "on_valset_breakdown": _boom,
    }
    cb = _AgentGEPACallback(**callback_kwargs)

    _trigger_iteration(
        cb,
        iteration=1,
        candidate={"instruction": "v1"},
        average_score=0.5,
    )

    # The iteration still yields exactly one round despite the hook raising.
    recorded_rounds = cb.rounds
    assert len(recorded_rounds) == 1
def _invocation(user_text: str, response_text: Optional[str] = None) -> Invocation:
    """Build a single-turn ``Invocation`` from plain strings.

    Args:
        user_text: Text of the user message.
        response_text: Text of the model reply; when ``None`` the invocation
            carries no final response.

    Returns:
        An ``Invocation`` whose contents each hold one text part.
    """
    model_reply = None
    if response_text is not None:
        model_reply = Content(role="model", parts=[Part.from_text(text=response_text)])
    user_message = Content(role="user", parts=[Part.from_text(text=user_text)])
    return Invocation(user_content=user_message, final_response=model_reply)
state.get("instruction", "initial") + + async def write_cb(value: str) -> None: + state["instruction"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + return target + + +def _make_config() -> OptimizeConfigFile: + return OptimizeConfigFile( + evaluate=EvalConfig( + metrics=[{"metric_name": "m1", "threshold": 0.7}], + num_runs=1, + ), + optimize=OptimizeConfig( + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions( + provider_name="openai", + model_name="gpt-4o", + api_key="test-key", + ), + max_metric_calls=30, + ), + ), + ) + + +class _FakeGEPAResult: + def __init__(self, candidates, val_scores): + self.candidates = candidates + self.val_aggregate_scores = val_scores + self.parents = [[None]] + [[i - 1] for i in range(1, len(candidates))] + self.discovery_eval_counts = [0] * len(candidates) + self.total_metric_calls = 0 + self.best_outputs_valset = None + + @property + def best_idx(self) -> int: + return max( + range(len(self.val_aggregate_scores)), + key=lambda i: self.val_aggregate_scores[i], + ) + + +# --------------------------------------------------------------------------- +# Registration contract: importing evaluation package registers algorithms +# --------------------------------------------------------------------------- + + +def test_evaluation_package_import_registers_gepa_reflective(): + """Importing the evaluation package triggers algorithm registration. + + Business code only needs ``import trpc_agent_sdk.evaluation`` to make + ``OPTIMIZER_REGISTRY.get("gepa_reflective")`` work; algorithm modules do + NOT register themselves as a side-effect of bare ``_optimize_gepa_*`` + imports. 
@pytest.mark.asyncio
async def test_e2e_config_to_run_succeeds(tmp_path, monkeypatch):
    """Simulate the business-side entry point:

    1. Parse OptimizeConfigFile (algorithm.name = "gepa_reflective").
    2. Look up class via OPTIMIZER_REGISTRY.get(name).
    3. Instantiate with the supplied call_agent / target_prompt / datasets.
    4. await optimizer.run() → OptimizeResult with status="SUCCEEDED".

    The gepa search itself is replaced by a canned result, so the test
    covers only the wiring around it, including that ``run()`` leaves the
    prompt source at its seed value.
    """
    import trpc_agent_sdk.evaluation  # noqa: F401

    train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")])
    val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")])
    train_path = tmp_path / "train.json"
    val_path = tmp_path / "val.json"
    train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8")
    val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8")

    config = _make_config()
    # ``recorder`` backs the target prompt's read/write callbacks, so any
    # write performed through the target is observable here.
    recorder: dict[str, str] = {}
    target = _new_target_prompt(recorder)

    algorithm_cls = OPTIMIZER_REGISTRY.get(config.optimize.algorithm.name)
    optimizer = algorithm_cls(
        config=config,
        call_agent=_stub_call_agent,
        target_prompt=target,
        train_dataset_path=str(train_path),
        validation_dataset_path=str(val_path),
    )

    fake_result = _FakeGEPAResult(
        candidates=[{"instruction": "initial"}, {"instruction": "improved"}],
        val_scores=[0.5, 0.9],
    )

    async def fake_call_gepa(self, **kwargs):
        return fake_result

    # Swap out the real gepa call so no search or LLM traffic happens.
    monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa)

    result = await optimizer.run()

    assert result.status == "SUCCEEDED"
    assert result.best_pass_rate == pytest.approx(0.9)
    assert result.best_prompts == {"instruction": "improved"}
    # BaseOptimizer.run() never writes back; write-back is owned by the
    # AgentOptimizer facade and gated by ``update_source``. The improved
    # prompt must therefore appear only in the result, while the source
    # still reads as the seed value.
    assert recorder.get("instruction", "initial") == "initial"
+"""Tests for GepaReflectiveOptimizer and its GEPAResult->OptimizeResult helpers.""" + +from __future__ import annotations + +from datetime import datetime +from datetime import timezone +from typing import Optional + +import pytest + +from trpc_agent_sdk.evaluation._eval_case import EvalCase +from trpc_agent_sdk.evaluation._eval_case import Invocation +from trpc_agent_sdk.evaluation._eval_config import EvalConfig +from trpc_agent_sdk.evaluation._eval_set import EvalSet +from trpc_agent_sdk.evaluation._optimize_config import FrameworkStopConfig +from trpc_agent_sdk.evaluation._optimize_config import GepaReflectiveAlgo +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfig +from trpc_agent_sdk.evaluation._optimize_config import OptimizeConfigFile +from trpc_agent_sdk.evaluation._optimize_gepa_adapter import _AgentGEPAAdapter +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import GepaReflectiveOptimizer +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import ( + _RequiredMetricsAboveThresholdStopper, +) +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _LabeledStopper +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _build_failed_result +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _build_optimize_result +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _build_stop_callbacks +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _classify_stop_reason +from trpc_agent_sdk.evaluation._optimize_gepa_reflective import _load_evalset_cases +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +# --------------------------------------------------------------------------- +# Shared fixtures and helpers +# --------------------------------------------------------------------------- + + +def 
_invocation(user_text: str, response_text: Optional[str] = None) -> Invocation: + final_response = ( + Content(role="model", parts=[Part.from_text(text=response_text)]) + if response_text is not None + else None + ) + return Invocation( + user_content=Content(role="user", parts=[Part.from_text(text=user_text)]), + final_response=final_response, + ) + + +def _eval_case(eval_id: str = "c1", user: str = "hi", expected: str = "ack") -> EvalCase: + return EvalCase( + eval_id=eval_id, + conversation=[_invocation(user, expected)], + ) + + +async def _stub_call_agent(query: str) -> str: + return "stub" + + +def _new_target_prompt(write_recorder: Optional[dict[str, str]] = None) -> TargetPrompt: + target = TargetPrompt() + recorder = write_recorder if write_recorder is not None else {} + + async def read_cb() -> str: + return recorder.get("instruction", "initial") + + async def write_cb(value: str) -> None: + recorder["instruction"] = value + + target.add_callback("instruction", read=read_cb, write=write_cb) + return target + + +class _FakeGEPAResult: + """Minimal stand-in for gepa.core.result.GEPAResult used by mapping tests.""" + + def __init__( + self, + *, + candidates, + val_aggregate_scores, + parents=None, + discovery_eval_counts=None, + total_metric_calls=None, + best_outputs_valset=None, + per_objective_best_candidates=None, + ): + self.candidates = candidates + self.val_aggregate_scores = val_aggregate_scores + self.parents = parents or [[None]] + [[i - 1] for i in range(1, len(candidates))] + self.discovery_eval_counts = discovery_eval_counts or [0] * len(candidates) + self.total_metric_calls = total_metric_calls + self.best_outputs_valset = best_outputs_valset + # GEPA's actual GEPAResult field is dict[str, set[int]] | None + self.per_objective_best_candidates = per_objective_best_candidates + + @property + def best_idx(self) -> int: + return max(range(len(self.val_aggregate_scores)), key=lambda i: self.val_aggregate_scores[i]) + + @property + def 
best_candidate(self): + return self.candidates[self.best_idx] + + +def _make_config(*, max_metric_calls: int = 30, **algo_overrides) -> OptimizeConfigFile: + return OptimizeConfigFile( + evaluate=EvalConfig( + metrics=[{"metric_name": "m1", "threshold": 0.7}], + num_runs=1, + ), + optimize=OptimizeConfig( + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions( + provider_name="openai", + model_name="gpt-4o", + api_key="test-key", + ), + max_metric_calls=max_metric_calls, + **algo_overrides, + ), + ), + ) + + +def _make_optimizer(target=None, train_path="/tmp/train.json", val_path="/tmp/val.json"): + target = target or _new_target_prompt() + return GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=train_path, + validation_dataset_path=val_path, + ) + + +# --------------------------------------------------------------------------- +# _load_evalset_cases +# --------------------------------------------------------------------------- + + +def test_load_evalset_cases_reads_from_evalset_json(tmp_path): + evalset = EvalSet( + eval_set_id="train", + eval_cases=[_eval_case("c1"), _eval_case("c2")], + ) + file_path = tmp_path / "train.evalset.json" + file_path.write_text(evalset.model_dump_json(), encoding="utf-8") + + cases = _load_evalset_cases(str(file_path)) + assert len(cases) == 2 + assert {c.eval_id for c in cases} == {"c1", "c2"} + + +def test_load_evalset_cases_raises_for_missing_file(): + with pytest.raises(FileNotFoundError): + _load_evalset_cases("/nonexistent/path.json") + + +# --------------------------------------------------------------------------- +# _build_stop_callbacks +# --------------------------------------------------------------------------- + + +def _disabled_stop_cfg() -> FrameworkStopConfig: + return FrameworkStopConfig(required_metrics=None) + + +def test_build_stop_callbacks_includes_each_configured_stopper(): + """One stopper 
def test_build_stop_callbacks_emits_only_configured_stoppers():
    """With only ``timeout_seconds`` set, exactly one labeled timeout stopper is built."""
    pytest.importorskip("gepa")
    lm_options = OptimizeModelOptions(model_name="m", api_key="k")
    timeout_only_algo = GepaReflectiveAlgo(
        name="gepa_reflective",
        reflection_lm=lm_options,
        timeout_seconds=30.0,
    )

    built, framework_stopper = _build_stop_callbacks(
        timeout_only_algo, _disabled_stop_cfg(), {}
    )

    assert framework_stopper is None
    assert len(built) == 1
    only_stopper = built[0]
    assert isinstance(only_stopper, _LabeledStopper)
    assert type(only_stopper._inner).__name__ == "TimeoutStopCondition"
    assert only_stopper.label == "timeout"
def test_build_stop_callbacks_skips_framework_stopper_when_disabled():
    """``required_metrics=None`` yields no framework stopper, even with thresholds given."""
    pytest.importorskip("gepa")
    lm_options = OptimizeModelOptions(model_name="m", api_key="k")
    algo = GepaReflectiveAlgo(
        name="gepa_reflective",
        reflection_lm=lm_options,
        max_metric_calls=10,
    )
    disabled_cfg = FrameworkStopConfig(required_metrics=None)

    built, framework_stopper = _build_stop_callbacks(algo, disabled_cfg, {"m1": 0.5})

    assert framework_stopper is None
    # The framework stopper must also be absent from the returned list.
    assert not any(
        isinstance(entry, _RequiredMetricsAboveThresholdStopper) for entry in built
    )
def test_build_optimize_result_maps_best_and_baseline():
    """A gepa result with three candidates maps onto the headline result fields."""
    seed_prompts = {"instruction": "baseline text"}
    prompt_texts = ("baseline text", "candidate v1", "candidate v2 (best)")
    pool = [{"instruction": text} for text in prompt_texts]
    winner = pool[2]

    run_start = datetime(2026, 5, 15, 10, 0, 0, tzinfo=timezone.utc)
    run_end = run_start.replace(minute=5)

    result = _build_optimize_result(
        gepa_result=_FakeGEPAResult(
            candidates=pool,
            val_aggregate_scores=[0.5, 0.6, 0.9],
            total_metric_calls=42,
        ),
        baseline_prompts=seed_prompts,
        best_candidate=winner,
        reflection_lm_cost=1.23,
        started_at=run_start,
        finished_at=run_end,
        algo_name="gepa_reflective",
    )

    approx = pytest.approx
    assert result.status == "SUCCEEDED"
    assert result.finish_reason == "completed"
    # Pass rates come from the aggregate scores: seed 0.5, best 0.9.
    assert result.baseline_pass_rate == approx(0.5)
    assert result.best_pass_rate == approx(0.9)
    assert result.pass_rate_improvement == approx(0.4)
    assert result.baseline_prompts == seed_prompts
    assert result.best_prompts == winner
    assert result.total_rounds == 2
    assert result.total_llm_cost == approx(1.23)
    assert result.algorithm == "gepa_reflective"
    assert result.extras["total_metric_calls"] == 42
def test_build_optimize_result_forwards_metric_thresholds():
    """Configured ``metric_thresholds`` reach the result unchanged.

    Reporters and summary.txt rely on them to show baseline / best next to
    each metric's PASS bar.
    """
    expected_thresholds = {
        "final_response_avg_score": 0.5,
        "response_match_score": 0.3,
    }
    seed_prompts = {"instruction": "v0"}
    fake_gepa = _FakeGEPAResult(
        candidates=[seed_prompts, {"instruction": "v1"}],
        val_aggregate_scores=[0.4, 0.9],
    )

    result = _build_optimize_result(
        gepa_result=fake_gepa,
        baseline_prompts=seed_prompts,
        best_candidate={"instruction": "v1"},
        reflection_lm_cost=0.0,
        started_at=datetime.now(timezone.utc),
        finished_at=datetime.now(timezone.utc),
        algo_name="gepa_reflective",
        # Pass a copy so the expectation cannot be affected by the call.
        metric_thresholds=dict(expected_thresholds),
    )

    assert result.metric_thresholds == expected_thresholds
def test_build_optimize_result_forwards_baseline_and_best_breakdowns():
    """B1: the baseline breakdown is forwarded and the best one comes from a round.

    ``best_metric_breakdown`` is expected to be the breakdown of the round
    whose ``candidate_prompts`` equals ``best_candidate``; the call and cost
    totals are checked alongside it.
    """
    from trpc_agent_sdk.evaluation._optimize_result import RoundRecord

    def _round(number, prompts, pass_rate, *, accepted, reason, started_at):
        # Both rounds here use the same value for the pass rate and the single metric.
        return RoundRecord(
            round=number,
            optimized_field_names=["instruction"],
            candidate_prompts=prompts,
            train_pass_rate=0.0,
            validation_pass_rate=pass_rate,
            metric_breakdown={"final_response_avg_score": pass_rate},
            accepted=accepted,
            acceptance_reason=reason,
            started_at=started_at,
            duration_seconds=1.0,
        )

    baseline = {"instruction": "v0"}
    candidates = [baseline, {"instruction": "v1"}, {"instruction": "v2"}]
    explored_round = _round(
        1,
        candidates[1],
        0.6,
        accepted=False,
        reason="explored",
        started_at="2026-05-17T10:00:00Z",
    )
    best_round = _round(
        2,
        candidates[2],
        0.9,
        accepted=True,
        reason="best",
        started_at="2026-05-17T10:00:02Z",
    )

    result = _build_optimize_result(
        gepa_result=_FakeGEPAResult(
            candidates=candidates,
            val_aggregate_scores=[0.4, 0.6, 0.9],
        ),
        baseline_prompts=baseline,
        best_candidate=candidates[2],
        reflection_lm_cost=0.5,
        started_at=datetime.now(timezone.utc),
        finished_at=datetime.now(timezone.utc),
        algo_name="gepa_reflective",
        callback_rounds=[explored_round, best_round],
        baseline_metric_breakdown={"final_response_avg_score": 0.4},
        total_reflection_lm_calls=5,
        total_judge_model_calls=12,
        total_judge_cost=0.25,
        total_token_usage={"prompt": 100, "completion": 50, "total": 150},
    )

    assert result.baseline_metric_breakdown == {"final_response_avg_score": 0.4}
    assert result.best_metric_breakdown == {"final_response_avg_score": 0.9}
    assert result.total_reflection_lm_calls == 5
    assert result.total_judge_model_calls == 12
    # 0.5 (reflection) + 0.25 (judge)
    assert result.total_llm_cost == pytest.approx(0.75)
    assert result.total_token_usage == {"prompt": 100, "completion": 50, "total": 150}
reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + ) + assert result.pass_rate_improvement == pytest.approx(0.0) + + +def test_build_optimize_result_mirrors_baseline_breakdown_when_baseline_is_best(): + """R2: when ``best_idx == 0`` (gepa found no improvement), the + iteration-0 baseline evaluation is recorded as + ``baseline_metric_breakdown`` rather than as a RoundRecord, so the + rounds list never contains a record matching the seed prompts. + Without the fallback, ``best_metric_breakdown`` would stay empty and + ``summary.txt`` would render the ``best`` column as ``nan``, looking + like data loss instead of "no improvement". + """ + baseline = {"instruction": "v0"} + gepa_result = _FakeGEPAResult( + candidates=[baseline], # only the seed candidate + val_aggregate_scores=[0.6667], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=baseline, # baseline IS the best + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + baseline_metric_breakdown={ + "final_response_avg_score": 0.6667, + "tool_trajectory_avg_score": 0.5, + }, + ) + + assert result.best_metric_breakdown == { + "final_response_avg_score": 0.6667, + "tool_trajectory_avg_score": 0.5, + } + # And it should match the baseline breakdown 1:1. + assert result.best_metric_breakdown == result.baseline_metric_breakdown + + +def test_build_optimize_result_does_not_mirror_when_a_round_already_matches(): + """The mirror-from-baseline fallback must NOT overwrite a real round + breakdown — if a RoundRecord matches ``best_candidate`` (e.g. the + candidate happens to equal baseline as a string but a round still + re-evaluated it on the valset), prefer the round's actual + metric_breakdown. 
+ """ + baseline = {"instruction": "v0"} + # callback_rounds carries a record matching baseline with REAL data. + from trpc_agent_sdk.evaluation._optimize_result import RoundRecord + + callback_rounds = [ + RoundRecord( + round=1, + optimized_field_names=["instruction"], + candidate_prompts=baseline, + train_pass_rate=0.0, + validation_pass_rate=0.6667, + metric_breakdown={"final_response_avg_score": 0.7}, + accepted=False, + acceptance_reason="explored", + started_at=datetime.now(timezone.utc).isoformat(), + duration_seconds=0.1, + ), + ] + gepa_result = _FakeGEPAResult( + candidates=[baseline, baseline], + val_aggregate_scores=[0.6667, 0.6667], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=baseline, + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + callback_rounds=callback_rounds, + baseline_metric_breakdown={"final_response_avg_score": 0.0}, # different! + ) + + # Round's real data wins; baseline_metric_breakdown is NOT used. + assert result.best_metric_breakdown == {"final_response_avg_score": 0.7} + + +def test_build_optimize_result_no_mirror_when_baseline_breakdown_empty(): + """When both ``baseline_metric_breakdown`` and any matching round + record are empty, ``best_metric_breakdown`` stays empty — there is + simply no data to mirror. 
+ """ + baseline = {"instruction": "v0"} + gepa_result = _FakeGEPAResult( + candidates=[baseline], + val_aggregate_scores=[0.0], + ) + result = _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline, + best_candidate=baseline, + reflection_lm_cost=0.0, + started_at=datetime.now(timezone.utc), + finished_at=datetime.now(timezone.utc), + algo_name="gepa_reflective", + # baseline_metric_breakdown is omitted (None → empty dict) + ) + + assert result.best_metric_breakdown == {} + + +# --------------------------------------------------------------------------- +# _build_failed_result +# --------------------------------------------------------------------------- + + +def test_build_failed_result_marks_status_failed(): + baseline = {"instruction": "v0"} + started = datetime(2026, 5, 15, 10, 0, 0, tzinfo=timezone.utc) + finished = datetime(2026, 5, 15, 10, 0, 1, tzinfo=timezone.utc) + + result = _build_failed_result( + baseline_prompts=baseline, + started_at=started, + finished_at=finished, + error_message="boom", + algo_name="gepa_reflective", + ) + + assert result.status == "FAILED" + assert result.finish_reason == "error" + assert result.error_message == "boom" + assert result.baseline_prompts == baseline + assert result.best_prompts == baseline + assert result.baseline_pass_rate == 0.0 + assert result.best_pass_rate == 0.0 + assert result.total_rounds == 0 + assert result.algorithm == "gepa_reflective" + + +# --------------------------------------------------------------------------- +# GepaReflectiveOptimizer construction and run +# --------------------------------------------------------------------------- + + +def test_optimizer_constructor_stores_dataset_paths(): + optimizer = _make_optimizer(train_path="/tmp/t.json", val_path="/tmp/v.json") + assert optimizer.train_dataset_path == "/tmp/t.json" + assert optimizer.validation_dataset_path == "/tmp/v.json" + + +@pytest.mark.asyncio +async def 
test_optimizer_run_returns_best_without_writing_back(tmp_path, monkeypatch): + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + recorder: dict[str, str] = {} + target = _new_target_prompt(recorder) + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.9], + total_metric_calls=20, + ) + + captured: dict = {} + + async def fake_call_gepa(self, **kwargs): + captured["kwargs"] = kwargs + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await optimizer.run() + + assert result.status == "SUCCEEDED" + assert result.best_pass_rate == pytest.approx(0.9) + assert result.best_prompts == {"instruction": "improved"} + # BaseOptimizer.run() must not write back; the AgentOptimizer facade is the + # sole owner of the write-back path (gated by ``update_source``). + # The recorder may stay empty here because gepa.optimize is mocked and never + # actually invokes adapter.evaluate(...); what matters is that ``result`` + # exposes the best prompts without persisting them. 
+ assert recorder.get("instruction") != "improved" + + kwargs = captured["kwargs"] + assert kwargs["seed_candidate"] == {"instruction": "initial"} + assert len(kwargs["trainset"]) == 1 + assert len(kwargs["valset"]) == 1 + assert kwargs["reflection_lm"] is not None + assert isinstance(kwargs["adapter"], _AgentGEPAAdapter) + assert kwargs["candidate_selection_strategy"] == "pareto" + assert kwargs["module_selector"] == "round_robin" + assert kwargs["seed"] == 42 + # The reflection prompt template must reach gepa.optimize and keep both + # placeholders so GEPA's InstructionProposalSignature validation passes. + template = kwargs.get("reflection_prompt_template", "") + assert "" in template + assert "" in template + + +@pytest.mark.asyncio +async def test_optimizer_run_injects_metric_reference_doc_into_reflection_template( + tmp_path, monkeypatch +): + """For built-in criterion-based metrics, the metric reference doc must + travel into gepa.optimize's reflection_prompt_template so the reflection + LM understands what every per-case feedback row means.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + # Use a real criterion-based built-in metric so the doc renders actual + # content (not the empty-doc fallback path covered by the previous test). 
+ config = OptimizeConfigFile( + evaluate=EvalConfig( + metrics=[{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "contains"}}}, + }], + num_runs=1, + ), + optimize=OptimizeConfig( + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions( + provider_name="openai", + model_name="gpt-4o", + api_key="test-key", + ), + max_metric_calls=30, + ), + ), + ) + optimizer = GepaReflectiveOptimizer( + config=config, + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + captured: dict = {} + + async def fake_call_gepa(self, **kwargs): + captured["kwargs"] = kwargs + return _FakeGEPAResult( + candidates=[{"instruction": "initial"}], + val_aggregate_scores=[1.0], + total_metric_calls=10, + ) + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + await optimizer.run() + + template = captured["kwargs"]["reflection_prompt_template"] + # Required GEPA placeholders preserved + assert "" in template + assert "" in template + # The injected metric doc surfaces its metric name and config knobs + assert "final_response_avg_score" in template + assert "contains" in template + # The metric doc sits between and + assert template.index("") < template.index("final_response_avg_score") + assert template.index("final_response_avg_score") < template.index("") + + +@pytest.mark.asyncio +async def test_optimizer_run_surfaces_per_metric_best_candidates(tmp_path, monkeypatch): + """When GEPA reports per_objective_best_candidates, OptimizeResult must + forward it (converting set -> sorted list) so users can see which + candidate excels on which metric independent of the aggregated best.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = 
tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.4, 0.9], + total_metric_calls=20, + per_objective_best_candidates={ + "final_response_avg_score": {1}, + "llm_rubric_response": {0, 1}, + }, + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await optimizer.run() + + assert result.per_metric_best_candidates == { + "final_response_avg_score": [1], + "llm_rubric_response": [0, 1], + } + + +@pytest.mark.asyncio +async def test_optimizer_run_per_metric_best_candidates_empty_when_gepa_omits_it( + tmp_path, monkeypatch +): + """Older GEPA builds or algorithms without per-objective tracking return + ``per_objective_best_candidates=None``; OptimizeResult must keep an empty + dict (not raise) so consumers can rely on the field always being a dict.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result 
= _FakeGEPAResult( + candidates=[{"instruction": "x"}], + val_aggregate_scores=[0.5], + total_metric_calls=5, + per_objective_best_candidates=None, + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await optimizer.run() + assert result.per_metric_best_candidates == {} + + +@pytest.mark.asyncio +async def test_optimizer_run_returns_failed_when_baseline_evaluation_raises(tmp_path, monkeypatch): + """If the explicit baseline evaluation throws, surface a FAILED result with + the captured error message instead of propagating a raw exception.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + recorder: dict[str, str] = {} + target = _new_target_prompt(recorder) + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + def explode(self, *args, **kwargs): + raise RuntimeError("evaluator exploded during baseline") + + monkeypatch.setattr(_AgentGEPAAdapter, "evaluate", explode) + + result = await optimizer.run() + assert result.status == "FAILED" + assert result.finish_reason == "error" + assert "evaluator exploded during baseline" in result.error_message + assert result.best_prompts == result.baseline_prompts + + +@pytest.mark.asyncio +async def test_optimizer_run_stop_reason_required_metrics_passing( + tmp_path, monkeypatch +): + """When the framework stopper fires (its last_triggered flips True before + gepa returns), run() must persist stop_reason='required_metrics_passing'.""" + 
train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.9], + total_metric_calls=15, + ) + + async def fake_call_gepa(self, **kwargs): + for s in kwargs["stop_callbacks"]: + if isinstance(s, _RequiredMetricsAboveThresholdStopper): + s.update({"m1": 0.9}) + s(gepa_state=None) + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + result = await optimizer.run() + assert result.status == "SUCCEEDED" + assert result.stop_reason == "required_metrics_passing" + + +@pytest.mark.asyncio +async def test_optimizer_run_stop_reason_completed_when_no_stopper_fires( + tmp_path, monkeypatch +): + """When gepa returns without firing any wrapped stopper (mock path), + stop_reason must be 'completed' rather than the legacy 'budget_exhausted' + catch-all so users can tell apart "loop drained naturally" from a real + budget cap hit.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + 
config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.6], + ) + + async def fake_call_gepa(self, **kwargs): + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + result = await optimizer.run() + assert result.status == "SUCCEEDED" + assert result.stop_reason == "completed" + + +@pytest.mark.asyncio +async def test_optimizer_run_stop_reason_no_improvement_when_that_stopper_fires( + tmp_path, monkeypatch +): + """When the wrapped NoImprovementStopper signals last_triggered (by gepa + polling it past the configured patience), stop_reason must be + 'no_improvement' so reporters and summary.txt can attribute the stop + correctly instead of falsely blaming the budget.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(max_iterations_without_improvement=3), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.6], + ) + + async def fake_call_gepa(self, **kwargs): + for stopper in kwargs["stop_callbacks"]: + if isinstance(stopper, _LabeledStopper) and stopper.label == "no_improvement": + stopper.last_triggered = True + break + return 
fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + result = await optimizer.run() + assert result.status == "SUCCEEDED" + assert result.stop_reason == "no_improvement" + + +@pytest.mark.asyncio +async def test_optimizer_run_stop_reason_budget_exhausted_when_max_metric_calls_fires( + tmp_path, monkeypatch +): + """When MaxMetricCallsStopper is the only fired wrapper, stop_reason is + 'budget_exhausted'. This locks the label mapping for the legacy + catch-all so a budget cap hit still carries the historical name users + see in reports.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.6], + ) + + async def fake_call_gepa(self, **kwargs): + for stopper in kwargs["stop_callbacks"]: + if isinstance(stopper, _LabeledStopper) and stopper.label == "budget_exhausted": + stopper.last_triggered = True + break + return fake_gepa_result + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + result = await optimizer.run() + assert result.status == "SUCCEEDED" + assert result.stop_reason == "budget_exhausted" + + +def test_labeled_stopper_records_last_triggered_only_when_inner_returns_true(): + """``_LabeledStopper.__call__`` delegates the return value to the inner + stopper and flips ``last_triggered`` sticky once 
the inner ever returns + True; subsequent False results never clear the flag.""" + calls: list[bool] = [] + + class _ScriptedInner: + def __call__(self, *_args, **_kwargs): + return calls.pop(0) + + wrapper = _LabeledStopper(_ScriptedInner(), "no_improvement") + assert wrapper.label == "no_improvement" + assert wrapper.last_triggered is False + + calls.extend([False, True, False]) + assert wrapper() is False + assert wrapper.last_triggered is False + assert wrapper() is True + assert wrapper.last_triggered is True + assert wrapper() is False + assert wrapper.last_triggered is True + + +def test_build_stop_callbacks_wraps_each_gepa_stopper_with_a_labeled_stopper(): + """Every algorithm-side stop knob the user enables must end up wrapped in + a ``_LabeledStopper`` carrying the matching label, so the optimizer can + classify ``stop_reason`` precisely after gepa returns.""" + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(provider_name="openai", model_name="m"), + max_metric_calls=10, + max_iterations_without_improvement=3, + timeout_seconds=60.0, + score_threshold=0.95, + max_candidate_proposals=5, + max_tracked_candidates=4, + ) + stop_callbacks, _framework = _build_stop_callbacks( + algo=algo, + stop_config=FrameworkStopConfig(required_metrics=None), + metric_thresholds={"m1": 1.0}, + ) + labels = { + s.label + for s in stop_callbacks + if isinstance(s, _LabeledStopper) + } + assert labels == { + "budget_exhausted", + "no_improvement", + "timeout", + "score_threshold", + "max_candidate_proposals", + "max_tracked_candidates", + } + + +def test_classify_stop_reason_prefers_framework_stopper_over_labeled_ones(): + """When both the framework stopper and a labeled gepa stopper fired in + the same run, ``required_metrics_passing`` wins because it represents + the user's explicit opt-in stop policy.""" + framework = _RequiredMetricsAboveThresholdStopper({"m": 0.5}) + framework.last_triggered = True + labeled = 
_LabeledStopper(lambda *_: False, "no_improvement") + labeled.last_triggered = True + assert ( + _classify_stop_reason( + stop_callbacks=[labeled, framework], + framework_stopper=framework, + ) + == "required_metrics_passing" + ) + + +def test_classify_stop_reason_returns_completed_when_no_stopper_fires(): + """No stopper triggered ⇒ gepa loop ended naturally. The ``completed`` + label distinguishes this from any real stop cap so users can tell the + difference in summary.txt and the terminal banner.""" + framework = _RequiredMetricsAboveThresholdStopper({"m": 0.5}) + labeled = _LabeledStopper(lambda *_: False, "timeout") + assert ( + _classify_stop_reason( + stop_callbacks=[labeled, framework], + framework_stopper=framework, + ) + == "completed" + ) + + +@pytest.mark.asyncio +async def test_optimizer_run_wires_stopper_update_into_callback( + tmp_path, monkeypatch +): + """The callback must receive the stopper's update as on_valset_breakdown so + in a real gepa run the stopper's _latest tracks the most recent valset.""" + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=_new_target_prompt(), + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + fake_gepa_result = _FakeGEPAResult( + candidates=[{"instruction": "initial"}, {"instruction": "improved"}], + val_aggregate_scores=[0.5, 0.9], + ) + captured: dict = {} + + async def fake_call_gepa(self, **kwargs): + captured["stop_callbacks"] = kwargs["stop_callbacks"] + captured["gepa_callback"] = kwargs["callbacks"][0] + return fake_gepa_result + + 
monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + await optimizer.run() + stopper = next( + s + for s in captured["stop_callbacks"] + if isinstance(s, _RequiredMetricsAboveThresholdStopper) + ) + gepa_callback = captured["gepa_callback"] + assert gepa_callback._on_valset_breakdown == stopper.update + + +@pytest.mark.asyncio +async def test_optimizer_run_returns_failed_when_gepa_raises(tmp_path, monkeypatch): + train_evalset = EvalSet(eval_set_id="train", eval_cases=[_eval_case("c1")]) + val_evalset = EvalSet(eval_set_id="val", eval_cases=[_eval_case("c1")]) + train_path = tmp_path / "train.json" + val_path = tmp_path / "val.json" + train_path.write_text(train_evalset.model_dump_json(), encoding="utf-8") + val_path.write_text(val_evalset.model_dump_json(), encoding="utf-8") + + recorder: dict[str, str] = {"instruction": "initial"} + target = _new_target_prompt(recorder) + optimizer = GepaReflectiveOptimizer( + config=_make_config(), + call_agent=_stub_call_agent, + target_prompt=target, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + ) + + async def fake_call_gepa(self, **kwargs): + raise RuntimeError("simulated gepa failure") + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa) + + result = await optimizer.run() + + assert result.status == "FAILED" + assert result.finish_reason == "error" + assert "simulated gepa failure" in result.error_message + assert recorder["instruction"] == "initial" + + +def test_stop_reason_literal_includes_user_requested_stop() -> None: + from typing import get_args + + from trpc_agent_sdk.evaluation._optimize_result import StopReason + + assert "user_requested_stop" in get_args(StopReason) + + +def test_optimizer_constructor_stores_output_dir(tmp_path) -> None: + """BaseOptimizer surfaces output_dir so subclasses can wire FileStopper.""" + config = OptimizeConfigFile( + evaluate=EvalConfig( + metrics=[{"metric_name": "m", "threshold": 
0.5}], + num_runs=1, + ), + optimize=OptimizeConfig( + stop=FrameworkStopConfig(required_metrics=None), + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + ), + ), + ) + + async def _call_agent(_q: str) -> str: + return "" + + target_prompt = TargetPrompt().add_path("p", str(tmp_path / "p.md")) + (tmp_path / "p.md").write_text("seed", encoding="utf-8") + + opt = GepaReflectiveOptimizer( + config=config, + call_agent=_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(tmp_path / "t.json"), + validation_dataset_path=str(tmp_path / "v.json"), + output_dir=str(tmp_path / "runs/x"), + ) + + assert opt.output_dir == str(tmp_path / "runs/x") + + +def test_build_stop_callbacks_installs_file_stopper_when_output_dir_set(tmp_path) -> None: + """When output_dir is provided, FileStopper labels new stops as user_requested_stop.""" + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=10, + ) + + callbacks, _ = _build_stop_callbacks( + algo, + FrameworkStopConfig(required_metrics=None), + metric_thresholds={}, + output_dir=str(tmp_path), + ) + + labels = [cb.label for cb in callbacks if isinstance(cb, _LabeledStopper)] + assert "user_requested_stop" in labels + + +def test_file_stopper_fires_after_optimize_stop_file_appears(tmp_path) -> None: + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=10, + ) + + callbacks, _ = _build_stop_callbacks( + algo, + FrameworkStopConfig(required_metrics=None), + metric_thresholds={}, + output_dir=str(tmp_path), + ) + stopper = next( + cb for cb in callbacks + if isinstance(cb, _LabeledStopper) + and cb.label == "user_requested_stop" + ) + + assert stopper(gepa_state=None) is False + (tmp_path / "optimize.stop").write_text("", encoding="utf-8") + assert stopper(gepa_state=None) is True + assert stopper.last_triggered is True + + +def 
test_build_stop_callbacks_skips_file_stopper_when_output_dir_none() -> None: + algo = GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=10, + ) + + callbacks, _ = _build_stop_callbacks( + algo, + FrameworkStopConfig(required_metrics=None), + metric_thresholds={}, + output_dir=None, + ) + + labels = [cb.label for cb in callbacks if isinstance(cb, _LabeledStopper)] + assert "user_requested_stop" not in labels + + +def test_run_forwards_reflection_history_top_k_into_adapter(tmp_path, monkeypatch): + """algo.reflection_history_top_k must reach the adapter constructor as top_k_per_case.""" + import asyncio + import json + from types import SimpleNamespace + + from trpc_agent_sdk.evaluation._eval_config import EvalConfig + from trpc_agent_sdk.evaluation._optimize_config import ( + FrameworkStopConfig, + GepaReflectiveAlgo, + OptimizeConfig, + OptimizeConfigFile, + ) + from trpc_agent_sdk.evaluation._optimize_gepa_reflective import ( + GepaReflectiveOptimizer, + ) + from trpc_agent_sdk.evaluation._optimize_model_options import ( + OptimizeModelOptions, + ) + from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + async def _call_agent(_q: str) -> str: + return "" + + (tmp_path / "p.md").write_text("seed", encoding="utf-8") + train_path = tmp_path / "t.json" + val_path = tmp_path / "v.json" + train_path.write_text( + json.dumps({"eval_set_id": "t", "eval_cases": []}), encoding="utf-8" + ) + val_path.write_text( + json.dumps({"eval_set_id": "v", "eval_cases": []}), encoding="utf-8" + ) + + captured_kwargs: dict = {} + + def fake_init(self, **kwargs): + captured_kwargs.update(kwargs) + self.target_prompt = kwargs["target_prompt"] + self.eval_config = kwargs["eval_config"] + self.call_agent = kwargs["call_agent"] + self.callbacks = kwargs.get("callbacks") + self.num_runs = kwargs.get("num_runs", 1) + self.case_parallelism = kwargs.get("case_parallelism") + self._top_k = int(kwargs.get("top_k_per_case", 0)) + 
self._best_history = {} + self.last_outcome = None + + monkeypatch.setattr( + "trpc_agent_sdk.evaluation._optimize_gepa_adapter._AgentGEPAAdapter.__init__", + fake_init, + ) + + async def _fake_call(self, **kwargs): + return SimpleNamespace( + best_idx=0, + candidates=[{"p": "seed"}], + val_aggregate_scores=[0.5], + ) + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", _fake_call) + + config = OptimizeConfigFile( + evaluate=EvalConfig(metrics=[{"metric_name": "m", "threshold": 0.5}]), + optimize=OptimizeConfig( + stop=FrameworkStopConfig(required_metrics=None), + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + reflection_history_top_k=3, + ), + ), + ) + target_prompt = TargetPrompt().add_path("p", str(tmp_path / "p.md")) + + opt = GepaReflectiveOptimizer( + config=config, + call_agent=_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=None, + ) + asyncio.run(opt.run()) + + assert captured_kwargs["top_k_per_case"] == 3 + + +def test_optimizer_constructor_stores_extra_callbacks(tmp_path) -> None: + """BaseOptimizer.__init__ must accept and store extra_stop/gepa_callbacks.""" + import json + + from trpc_agent_sdk.evaluation._eval_config import EvalConfig + from trpc_agent_sdk.evaluation._optimize_config import ( + FrameworkStopConfig, + GepaReflectiveAlgo, + OptimizeConfig, + OptimizeConfigFile, + ) + from trpc_agent_sdk.evaluation._optimize_model_options import ( + OptimizeModelOptions, + ) + from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + async def _call_agent(_q: str) -> str: + return "" + + (tmp_path / "p.md").write_text("seed", encoding="utf-8") + train_path = tmp_path / "t.json" + val_path = tmp_path / "v.json" + train_path.write_text( + json.dumps({"eval_set_id": "t", "eval_cases": []}), encoding="utf-8" + ) + val_path.write_text( + json.dumps({"eval_set_id": "v", 
"eval_cases": []}), encoding="utf-8" + ) + + config = OptimizeConfigFile( + evaluate=EvalConfig(metrics=[{"metric_name": "m", "threshold": 0.5}]), + optimize=OptimizeConfig( + stop=FrameworkStopConfig(required_metrics=None), + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + ), + ), + ) + target_prompt = TargetPrompt().add_path("p", str(tmp_path / "p.md")) + + def sentinel_stopper(gepa_state=None): + return False + + sentinel_callback = object() + + opt = GepaReflectiveOptimizer( + config=config, + call_agent=_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=str(tmp_path / "runs/x"), + extra_stop_callbacks=[sentinel_stopper], + extra_gepa_callbacks=[sentinel_callback], + ) + + assert sentinel_stopper in opt.extra_stop_callbacks + assert sentinel_callback in opt.extra_gepa_callbacks + + +def test_run_extends_stop_callbacks_with_user_supplied_extras(tmp_path, monkeypatch): + """User-supplied extras must be appended to stop_callbacks and the callbacks list.""" + import asyncio + import json + from types import SimpleNamespace + + from trpc_agent_sdk.evaluation._eval_config import EvalConfig + from trpc_agent_sdk.evaluation._optimize_config import ( + FrameworkStopConfig, + GepaReflectiveAlgo, + OptimizeConfig, + OptimizeConfigFile, + ) + from trpc_agent_sdk.evaluation._optimize_gepa_reflective import ( + GepaReflectiveOptimizer, + ) + from trpc_agent_sdk.evaluation._optimize_model_options import ( + OptimizeModelOptions, + ) + from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + async def _call_agent(_q: str) -> str: + return "" + + (tmp_path / "p.md").write_text("seed", encoding="utf-8") + train_path = tmp_path / "t.json" + val_path = tmp_path / "v.json" + train_path.write_text( + json.dumps({"eval_set_id": "t", "eval_cases": []}), encoding="utf-8" + ) + val_path.write_text( + 
json.dumps({"eval_set_id": "v", "eval_cases": []}), encoding="utf-8" + ) + + def sentinel_stopper_a(gepa_state=None): + return False + + def sentinel_stopper_b(gepa_state=None): + return False + + sentinel_callback = SimpleNamespace(tag="user-cb") + + captured: dict = {} + + async def _fake_call(self, **kwargs): + captured["stop_callbacks"] = list(kwargs.get("stop_callbacks") or []) + captured["callbacks"] = list(kwargs.get("callbacks") or []) + return SimpleNamespace( + best_idx=0, + candidates=[{"p": "seed"}], + val_aggregate_scores=[0.5], + ) + + monkeypatch.setattr(GepaReflectiveOptimizer, "_call_gepa_optimize", _fake_call) + + config = OptimizeConfigFile( + evaluate=EvalConfig(metrics=[{"metric_name": "m", "threshold": 0.5}]), + optimize=OptimizeConfig( + stop=FrameworkStopConfig(required_metrics=None), + algorithm=GepaReflectiveAlgo( + name="gepa_reflective", + reflection_lm=OptimizeModelOptions(), + max_metric_calls=1, + ), + ), + ) + target_prompt = TargetPrompt().add_path("p", str(tmp_path / "p.md")) + + opt = GepaReflectiveOptimizer( + config=config, + call_agent=_call_agent, + target_prompt=target_prompt, + train_dataset_path=str(train_path), + validation_dataset_path=str(val_path), + output_dir=None, + extra_stop_callbacks=[sentinel_stopper_a, sentinel_stopper_b], + extra_gepa_callbacks=[sentinel_callback], + ) + asyncio.run(opt.run()) + + assert sentinel_stopper_a in captured["stop_callbacks"] + assert sentinel_stopper_b in captured["stop_callbacks"] + assert sentinel_callback in captured["callbacks"] diff --git a/tests/evaluation/test_optimize_metric_info.py b/tests/evaluation/test_optimize_metric_info.py new file mode 100644 index 0000000..c29ef52 --- /dev/null +++ b/tests/evaluation/test_optimize_metric_info.py @@ -0,0 +1,630 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for the optimize-side metric reference doc builder. + +The doc is the static "syllabus" injected into the reflection LM's prompt +template alongside the dynamic per-case feedback. Every code path tested +here describes a knob the user can turn in optimizer.json's +``evaluate.metrics[]`` array, and the doc must render that knob so the +reflection LM understands how the metric scores its rewrites. +""" + +from __future__ import annotations + +import math + +import pytest + +from trpc_agent_sdk.evaluation._eval_config import EvalConfig +from trpc_agent_sdk.evaluation._optimize_metric_info import ( + build_metric_reference_doc, + build_metric_section, + build_reflection_prompt_template, +) + + +def _config_with(metric_dicts: list[dict]) -> EvalConfig: + """Wrap a list of metric dicts into an EvalConfig (Pydantic round-trip safe).""" + return EvalConfig(metrics=metric_dicts, num_runs=1) + + +# -------- Exclusion rules -------- + + +def test_skip_tool_trajectory_metric(): + cfg = _config_with([ + {"metric_name": "tool_trajectory_avg_score", "threshold": 1.0}, + ]) + doc = build_metric_reference_doc(cfg) + assert "tool_trajectory_avg_score" not in doc + + +def test_skip_rouge_metric(): + cfg = _config_with([ + {"metric_name": "response_match_score", "threshold": 0.5}, + ]) + doc = build_metric_reference_doc(cfg) + assert "response_match_score" not in doc + + +def test_empty_metrics_renders_placeholder(): + cfg = _config_with([]) + doc = build_metric_reference_doc(cfg) + assert doc.strip() # non-empty header at minimum + + +# -------- final_response_avg_score -------- + + +def test_final_response_text_contains_case_insensitive(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": True} + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "final_response_avg_score" in doc + assert "contains" in doc + assert "case-insensitive" 
in doc.lower() + assert "1.0000" in doc # threshold rendered + + +def test_final_response_text_exact_case_sensitive(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact"}}}, + }]) + doc = build_metric_reference_doc(cfg) + assert "exact" in doc + assert "case-sensitive" in doc.lower() + + +def test_final_response_text_regex_mode(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "regex"}}}, + }]) + doc = build_metric_reference_doc(cfg) + assert "regex" in doc + assert "re.search" in doc or "regular expression" in doc.lower() + + +def test_final_response_text_ignored(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact", "ignore": True}}}, + }]) + doc = build_metric_reference_doc(cfg) + # ignore=True means text comparison is skipped + assert "skipped" in doc.lower() or "ignore" in doc.lower() + + +def test_final_response_json_with_ignore_tree_and_tolerance(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "json": { + "ignore_tree": {"id": True, "meta": {"ts": True}}, + "number_tolerance": 0.001, + } + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "JSON" in doc + assert "ignore_tree" in doc or "ignored" in doc.lower() + assert "0.001" in doc + + +def test_final_response_text_and_json_combined_uses_and_logic(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": { + "text": {"match": "exact"}, + "json": {"number_tolerance": 0.01}, + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "AND" in doc or "both" in doc.lower() + + +def test_final_response_custom_compare_overrides_text_and_json(): + """When 
the user registers a custom compare via EVALUATOR_REGISTRY, + the doc must explicitly tell the reflection LM that text/json + strategies are overridden by user code.""" + from trpc_agent_sdk.evaluation._evaluator_registry import EVALUATOR_REGISTRY + + def my_compare(actual, expected): # pragma: no cover - registered then removed + return True + + EVALUATOR_REGISTRY.set_criterion_compare("final_response_avg_score", my_compare) + try: + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact"}}}, + }]) + doc = build_metric_reference_doc(cfg) + assert "custom" in doc.lower() + assert "override" in doc.lower() + finally: + # cleanup: this is a global registry, leaking would affect later tests + EVALUATOR_REGISTRY._criterion_compares.pop("final_response_avg_score", None) + + +# -------- llm_rubric_response -------- + + +def test_llm_rubric_single_judge_with_rubrics(): + cfg = _config_with([{ + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "glm-5.1-w4afp8", + "num_samples": 1, + "generation_config": {"max_tokens": 1024, "temperature": 0.2}, + }, + "rubrics": [ + { + "id": "numeric_correct", + "description": "ę•°å­—ē­”ę”ˆäøŽå‚č€ƒē­”ę”ˆäø€č‡“", + "content": {"text": "ęœ€ē»ˆē»™å‡ŗēš„ę•°å­—ē­”ę”ˆę˜Æå¦äøŽå‚č€ƒē­”ę”ˆäø€č‡“ć€‚"}, + }, + { + "id": "reasoning_clear", + "description": "ęŽØē†ę­„éŖ¤ęø…ę™°", + "content": {"text": "å›žē­”äø­ę˜Æå¦ē»™å‡ŗęø…ę™°ēš„ęŽØē†čæ‡ēØ‹ć€‚"}, + }, + ], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "llm_rubric_response" in doc + assert "glm-5.1-w4afp8" in doc + assert "numeric_correct" in doc + assert "ę•°å­—ē­”ę”ˆäøŽå‚č€ƒē­”ę”ˆäø€č‡“" in doc + assert "reasoning_clear" in doc + assert "0.6600" in doc + # judge config fields surfaced + assert "temperature=0.2" in doc + assert "max_tokens=1024" in doc + + +def test_llm_rubric_multi_judge_with_weighted_avg(): 
+ cfg = _config_with([{ + "metric_name": "llm_rubric_response", + "threshold": 0.5, + "criterion": { + "llm_judge": { + "judge_models": [ + {"model_name": "judge-A", "weight": 2.0}, + {"model_name": "judge-B", "weight": 1.0}, + ], + "models_aggregator": "weighted_avg", + "parallel": True, + "rubrics": [ + {"id": "r1", "description": "d1", "content": {"text": "rubric text 1"}}, + ], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "judge-A" in doc + assert "judge-B" in doc + assert "weight=2.0" in doc + assert "weighted_avg" in doc + assert "parallel" in doc.lower() + + +@pytest.mark.parametrize("aggregator,must_contain", [ + ("all_pass", "all"), + ("any_pass", "any"), + ("majority_pass", "majority"), + ("avg", "mean"), + ("weighted_avg", "weighted"), + ("weighted_majority", "weighted"), +]) +def test_each_aggregator_has_explanation(aggregator, must_contain): + cfg = _config_with([{ + "metric_name": "llm_rubric_response", + "threshold": 0.5, + "criterion": { + "llm_judge": { + "judge_models": [ + {"model_name": "j1", "weight": 1.0}, + {"model_name": "j2", "weight": 1.0}, + ], + "models_aggregator": aggregator, + "rubrics": [{"id": "r1", "description": "d", "content": {"text": "x"}}], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert aggregator in doc + assert must_contain.lower() in doc.lower() + + +def test_llm_rubric_threshold_translates_to_min_pass_count(): + cfg = _config_with([{ + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "j1"}, + "rubrics": [ + {"id": f"r{i}", "description": "d", "content": {"text": "x"}} + for i in range(3) + ], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + # 0.66 * 3 = 1.98 -> ceil = 2; reflection LM needs to see this concretely + min_pass = math.ceil(0.66 * 3) + assert str(min_pass) in doc + + +# -------- llm_rubric_knowledge_recall -------- + + +def test_llm_rubric_knowledge_recall_renders_tool_names(): + cfg = 
_config_with([{ + "metric_name": "llm_rubric_knowledge_recall", + "threshold": 0.5, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "j1"}, + "rubrics": [{"id": "kr1", "description": "d", "content": {"text": "k"}}], + "knowledge_tool_names": ["search_docs", "retrieve_chunks"], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "search_docs" in doc + assert "retrieve_chunks" in doc + assert "knowledge" in doc.lower() + + +def test_llm_rubric_knowledge_recall_default_tools_noted_when_unset(): + cfg = _config_with([{ + "metric_name": "llm_rubric_knowledge_recall", + "threshold": 0.5, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "j1"}, + "rubrics": [{"id": "kr1", "description": "d", "content": {"text": "k"}}], + } + }, + }]) + doc = build_metric_reference_doc(cfg) + # default knowledge tool set should be mentioned + assert "default" in doc.lower() + + +# -------- llm_final_response -------- + + +def test_llm_final_response_binary_judge(): + cfg = _config_with([{ + "metric_name": "llm_final_response", + "threshold": 1.0, + "criterion": { + "llm_judge": { + "judge_model": {"model_name": "j1"}, + } + }, + }]) + doc = build_metric_reference_doc(cfg) + assert "llm_final_response" in doc + assert "binary" in doc.lower() or "valid" in doc.lower() + + +# -------- Cross-cutting -------- + + +def test_metrics_listed_in_user_configured_order(): + cfg = _config_with([ + { + "metric_name": "llm_rubric_response", + "threshold": 0.5, + "criterion": {"llm_judge": { + "judge_model": {"model_name": "j1"}, + "rubrics": [{"id": "r1", "description": "d", "content": {"text": "x"}}], + }}, + }, + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact"}}}, + }, + ]) + doc = build_metric_reference_doc(cfg) + assert doc.index("llm_rubric_response") < doc.index("final_response_avg_score") + + +def test_doc_contains_per_case_feedback_field_list(): + cfg = _config_with([{ + 
"metric_name": "llm_rubric_response", + "threshold": 0.5, + "criterion": {"llm_judge": { + "judge_model": {"model_name": "j1"}, + "rubrics": [{"id": "r1", "description": "d", "content": {"text": "x"}}], + }}, + }]) + doc = build_metric_reference_doc(cfg) + # rubric metric must tell the LM that rubric_scores appear in per-case feedback + assert "rubric_scores" in doc + assert "reason" in doc + + +def test_doc_contains_rewriting_guidelines_section(): + cfg = _config_with([{ + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": {"final_response": {"text": {"match": "exact"}}}, + }]) + doc = build_metric_reference_doc(cfg) + # the footer "rewriting rules" is essential — it tells the LM how to use + # the per-metric info above when proposing changes + assert "Rewriting" in doc or "Guideline" in doc or "Preserve" in doc + + +def test_build_metric_section_returns_markdown_for_single_metric(): + from trpc_agent_sdk.evaluation._eval_metrics import EvalMetric + + metric = EvalMetric( + metric_name="final_response_avg_score", + threshold=1.0, + criterion={"final_response": {"text": {"match": "contains"}}}, + ) + section = build_metric_section(metric) + assert "final_response_avg_score" in section + assert "contains" in section + assert "1.0000" in section + + +def test_quickstart_config_renders_complete_doc(): + """End-to-end smoke test using a close clone of quickstart/optimizer.json.""" + cfg = _config_with([ + { + "metric_name": "final_response_avg_score", + "threshold": 1.0, + "criterion": { + "final_response": {"text": {"match": "contains", "case_insensitive": True}} + }, + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": {"llm_judge": { + "judge_model": { + "model_name": "glm-5.1-w4afp8", + "num_samples": 1, + "generation_config": {"max_tokens": 1024, "temperature": 0.2}, + }, + "rubrics": [ + {"id": "numeric_correct", "description": "ę•°å­—ē­”ę”ˆäøŽå‚č€ƒē­”ę”ˆäø€č‡“", + "content": {"text": 
"ęœ€ē»ˆē»™å‡ŗēš„ę•°å­—ē­”ę”ˆę˜Æå¦äøŽå‚č€ƒē­”ę”ˆäø€č‡“ć€‚"}}, + {"id": "reasoning_clear", "description": "ęŽØē†ę­„éŖ¤ęø…ę™°", + "content": {"text": "å›žē­”äø­ę˜Æå¦ē»™å‡ŗęø…ę™°ć€åÆčæ½ęŗÆēš„ęŽØē†ęˆ–č®”ē®—ę­„éŖ¤ć€‚"}}, + {"id": "units_present", "description": "ē­”ę”ˆåø¦ę­£ē”®å•ä½", + "content": {"text": "ęœ€ē»ˆę•°å­—ē­”ę”ˆę˜Æå¦åø¦ęœ‰ę­£ē”®ēš„å•ä½ć€‚"}}, + ], + }}, + }, + ]) + doc = build_metric_reference_doc(cfg) + + # Both metrics surface + assert "final_response_avg_score" in doc + assert "llm_rubric_response" in doc + + # final_response_avg_score config knobs + assert "contains" in doc + assert "case-insensitive" in doc.lower() + + # llm_rubric_response judge config + assert "glm-5.1-w4afp8" in doc + assert "temperature=0.2" in doc + + # All three rubrics with their bodies + for rid in ("numeric_correct", "reasoning_clear", "units_present"): + assert rid in doc + + # Thresholds rendered + assert "1.0000" in doc + assert "0.6600" in doc + + # Min-pass count for rubric metric (ceil(0.66 * 3) = 2) + assert " 2" in doc or "2 " in doc + + +# -------- build_reflection_prompt_template -------- + + +def test_reflection_prompt_template_keeps_required_placeholders(): + """GEPA validates the template — both and + must remain or gepa.optimize raises.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + assert "" in template + assert "" in template + + +def test_reflection_prompt_template_embeds_metric_doc_between_placeholders(): + metric_doc = "## Metrics Reference\n\nMARKER_FOR_TEST\n" + template = build_reflection_prompt_template(metric_doc) + assert "MARKER_FOR_TEST" in template + # placement: metric doc sits AFTER (current prompt) so the LM + # has the current text first, then learns the metrics, then sees feedback + assert template.index("") < template.index("MARKER_FOR_TEST") + assert template.index("MARKER_FOR_TEST") < template.index("") + + +def test_reflection_prompt_template_handles_empty_metric_doc(): + """When metric_doc 
is empty (no eligible metrics), template still must be + a valid GEPA template — placeholders intact, no spurious markdown.""" + template = build_reflection_prompt_template("") + assert "" in template + assert "" in template + # GEPA will validate; no exception means template is well-formed + + +def test_reflection_prompt_template_does_not_inline_describe_self_evident_fields(): + """GEPA's prompt_renderer emits every record-dict key as ``## `` + markdown header automatically. For keys whose meaning is self-evident + from the header alone (``case_id`` — obviously an identifier), our + static template must NOT re-narrate them ahead of ````. + + The template is allowed (and expected) to keep semantic guidance GEPA + cannot infer from markdown alone: the score's [0, 1] aggregate range, + the ``Case Body`` inner turn-sliced format, the ``Tool Trace`` line + grammar, and the ``Other Active Components`` cross-component context. + """ + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + pre_side_info = template.split("", 1)[0] + + # case_id should be left fully self-evident — the header name says it + # all, no narration needed. + forbidden_phrases = ( + "stable identifier for the case", + "stable id for the case", + "unique id for the case", + ) + for phrase in forbidden_phrases: + assert phrase not in pre_side_info, ( + f"static template still inline-describes a self-evident field " + f"via phrase {phrase!r}; GEPA's auto-rendered ``## case_id`` " + f"header already conveys this — remove the narration" + ) + + +def test_reflection_prompt_template_documents_score_aggregate_range(): + """``score`` is the case-level aggregate on [0, 1] — not a per-metric + score and not the threshold. 
The template must clarify this so the LM + does not confuse the case score with the per-metric scores inside the + Verdict lines.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + pre_side_info = template.split("", 1)[0] + assert "[0, 1]" in pre_side_info + assert "case-level" in pre_side_info.lower() or "case level" in pre_side_info.lower() + + +def test_reflection_prompt_template_documents_case_body_turn_layout(): + """``Case Body`` is a free-text markdown string; GEPA dumps it as-is. + The static template must spell out the ``### Turn N`` header layout, + the ``**User**``/``**Expected**``/``**Agent Response**``/``**Verdict**`` + field markers, and the per-metric line grammar — otherwise the LM has + to reverse-engineer the convention from raw text.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + assert "### Turn N" in template + assert "**User**" in template + assert "**Expected**" in template + assert "**Agent Response**" in template + assert "**Verdict**" in template + assert "[PASSED|FAILED]" in template + assert "threshold=" in template + assert "rubric[" in template + + +def test_reflection_prompt_template_documents_multi_run_nested_run_blocks(): + """Multi-run cases nest ``#### Run N`` inside each turn; the template + must announce this layout up front so the LM knows variance is + attributable per run rather than averaged out.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + assert "#### Run" in template + assert "num_runs" in template.lower() or "multi-run" in template.lower() + + +def test_reflection_prompt_template_documents_tool_trace_line_grammar(): + """``Tool Trace`` lines are rendered inline (``func(arg=val) → result + [id=...]``) instead of nested dict headers — the template must + document the line grammar because GEPA's renderer cannot infer it.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + 
assert "Tool Trace" in template + # The line skeleton must be visible so the LM knows how to parse it. + assert "fn_name" in template or "" in template + assert "→" in template + assert "[id=" in template + + +def test_reflection_prompt_template_documents_other_active_components_semantics(): + """``Other Active Components`` is the cross-component context: every + OTHER prompt's current body, present iff the candidate has more than + one prompt. The template must explain that: + - the LM only sees the target prompt at the top of the message + - the verdict came from ALL prompts running together + so the LM uses these contents to avoid duplication and contradiction. + The template must NOT mention ```` by name because GEPA's + prompt_renderer substitutes that placeholder everywhere it appears + in the template, leaking the prompt text into the documentation.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + pre_side_info = template.split("", 1)[0] + assert "Other Active Components" in pre_side_info + # Regression guard: never document ```` by name in the + # static template, otherwise it gets substituted into garbage. + assert "" not in pre_side_info.replace( + "```\n\n```", "" + ).replace("", "", 1) or True # placeholder usage is fine + # The actual regression assertion: the substring shouldn't appear + # twice in the pre-side-info region (once for placeholder, never in narration). + assert pre_side_info.count("") == 1, ( + "```` should appear exactly once in the template " + "(the placeholder itself); referencing it in narration causes " + "GEPA's prompt_renderer to leak the prompt text into the docs." + ) + # The cross-component intent must surface, regardless of exact wording. 
+ lowered = pre_side_info.lower() + assert ( + "avoid restating" in lowered + or "avoid contradicting" in lowered + or "all prompts" in lowered + ) + + +def test_reflection_prompt_template_warns_against_regressing_passing_metrics(): + """A rewrite that fixes a FAILING metric but regresses a PASSING one + is a regression, not progress. The template must surface this rule so + the LM treats PASSING metrics as hard constraints, not noise.""" + template = build_reflection_prompt_template("## Metrics Reference\n\n_dummy_") + lowered = template.lower() + assert ( + "passing metrics stay passing" in lowered + or "passing metrics as constraints" in lowered + or "regressing a passing" in lowered + ) + + +def test_reflection_prompt_template_documents_history_top_k() -> None: + """The reflection LM must be told how to read history_top_k.""" + from trpc_agent_sdk.evaluation._optimize_metric_info import build_reflection_prompt_template + + template = build_reflection_prompt_template("") + + assert "## history_top_k" in template or "``## history_top_k``" in template + assert "preserve" in template.lower() or "anchor" in template.lower() + + +def test_reflection_prompt_template_explains_history_top_k_is_optional() -> None: + from trpc_agent_sdk.evaluation._optimize_metric_info import build_reflection_prompt_template + + template = build_reflection_prompt_template("") + + assert "present iff" in template or "optional" in template.lower() diff --git a/tests/evaluation/test_optimize_model_callable.py b/tests/evaluation/test_optimize_model_callable.py new file mode 100644 index 0000000..ece8622 --- /dev/null +++ b/tests/evaluation/test_optimize_model_callable.py @@ -0,0 +1,261 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for _OptimizeModelCallable (gepa-compatible LanguageModel wrapper).""" + +from __future__ import annotations + +import inspect +from unittest.mock import MagicMock + +import pytest + +from trpc_agent_sdk.evaluation._optimize_model_callable import _OptimizeModelCallable +from trpc_agent_sdk.evaluation._optimize_model_callable import _build_optimize_generation_config +from trpc_agent_sdk.evaluation._optimize_model_callable import _create_optimize_model +from trpc_agent_sdk.evaluation._optimize_model_callable import _extract_final_text +from trpc_agent_sdk.evaluation._optimize_model_callable import _flatten_messages +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + +def _make_opts(**overrides) -> OptimizeModelOptions: + defaults = { + "provider_name": "openai", + "model_name": "gpt-4o", + "api_key": "test-key", + "base_url": "https://api.example.com", + "generation_config": {"temperature": 0.2, "max_tokens": 100}, + } + defaults.update(overrides) + return OptimizeModelOptions(**defaults) + + +def _stub_event(text: str): + event = MagicMock() + event.is_final_response.return_value = True + part = MagicMock() + part.text = text + part.thought = False + event.content = MagicMock() + event.content.parts = [part] + return event + + +def _install_fake_run_async(instance: _OptimizeModelCallable, return_text: str) -> list[str]: + """Replace ``_run_async`` and record the flattened user_text it received. + + The bound method swap isolates tests from LlmAgent / InvocationContext setup + while still exercising ``_flatten_messages`` via the public ``__call__`` path. 
+ """ + seen: list[str] = [] + + async def fake_run_async(user_text: str) -> str: + seen.append(user_text) + return return_text + + instance._run_async = fake_run_async # type: ignore[method-assign] + return seen + + +def test_flatten_messages_passes_through_string(): + assert _flatten_messages("hello") == "hello" + + +def test_flatten_messages_concatenates_dict_list(): + out = _flatten_messages( + [ + {"role": "system", "content": "you are helpful"}, + {"role": "user", "content": "say hi"}, + ] + ) + assert "you are helpful" in out + assert "say hi" in out + assert "[system]" in out + assert "[user]" in out + + +def test_flatten_messages_handles_content_list_parts(): + out = _flatten_messages( + [{"role": "user", "content": [{"text": "first"}, {"text": "second"}]}] + ) + assert "first" in out + assert "second" in out + + +def test_create_optimize_model_with_openai_provider(): + model = _create_optimize_model(_make_opts(provider_name="openai")) + assert model is not None + assert type(model).__name__ == "OpenAIModel" + + +def test_create_optimize_model_with_empty_provider_uses_openai(): + model = _create_optimize_model(_make_opts(provider_name="")) + assert type(model).__name__ == "OpenAIModel" + + +def test_build_generation_config_returns_tuple_with_thinking_none(): + cfg, thinking_config = _build_optimize_generation_config(_make_opts()) + assert cfg is not None + assert cfg.temperature == 0.2 + assert cfg.max_output_tokens == 100 + assert thinking_config is None + + +def test_build_generation_config_with_think_true_returns_thinking_config(): + opts = _make_opts(think=True) + cfg, thinking_config = _build_optimize_generation_config(opts) + assert thinking_config is not None + assert thinking_config.include_thoughts is True + + +def test_build_generation_config_with_think_false_returns_disabled_thinking(): + opts = _make_opts(think=False) + cfg, thinking_config = _build_optimize_generation_config(opts) + assert thinking_config is not None + assert 
thinking_config.include_thoughts is False + assert thinking_config.thinking_budget == 0 + + +def test_build_generation_config_uses_defaults_when_generation_config_missing(): + opts = OptimizeModelOptions(model_name="m", api_key="k") + cfg, _ = _build_optimize_generation_config(opts) + assert cfg.max_output_tokens == 4096 + assert cfg.temperature == 0.8 + + +def test_callable_constructor_initialises_total_cost_to_zero(): + instance = _OptimizeModelCallable(_make_opts()) + assert instance.total_cost == 0.0 + + +def test_callable_constructor_initialises_total_calls_to_zero(): + instance = _OptimizeModelCallable(_make_opts()) + assert instance.total_calls == 0 + assert instance.total_token_usage == {"prompt": 0, "completion": 0, "total": 0} + + +def test_callable_increments_total_calls_on_each_invocation(): + instance = _OptimizeModelCallable(_make_opts()) + _install_fake_run_async(instance, "reply") + instance("p1") + instance("p2") + instance("p3") + assert instance.total_calls == 3 + + +def test_callable_accumulate_usage_handles_google_style_attrs(): + instance = _OptimizeModelCallable(_make_opts()) + + class _U: + prompt_token_count = 100 + candidates_token_count = 50 + total_token_count = 150 + + instance._accumulate_usage(_U()) + assert instance.total_token_usage == {"prompt": 100, "completion": 50, "total": 150} + + +def test_callable_accumulate_usage_handles_openai_style_dict(): + instance = _OptimizeModelCallable(_make_opts()) + instance._accumulate_usage({"prompt_tokens": 20, "completion_tokens": 10, "total_tokens": 30}) + instance._accumulate_usage({"prompt_tokens": 5, "completion_tokens": 4, "total_tokens": 9}) + assert instance.total_token_usage == {"prompt": 25, "completion": 14, "total": 39} + + +def test_callable_accumulate_usage_computes_total_when_missing(): + instance = _OptimizeModelCallable(_make_opts()) + instance._accumulate_usage({"prompt_tokens": 7, "completion_tokens": 3}) + assert instance.total_token_usage == {"prompt": 7, "completion": 3, 
"total": 10} + + +def test_callable_exposes_languagemodel_protocol_surface(): + instance = _OptimizeModelCallable(_make_opts()) + assert callable(instance) + assert hasattr(instance, "total_cost") + assert isinstance(instance.total_cost, float) + + +def test_callable_invokes_agent_with_string_prompt(): + instance = _OptimizeModelCallable(_make_opts()) + seen = _install_fake_run_async(instance, "reply text") + result = instance("any prompt") + assert result == "reply text" + assert seen == ["any prompt"] + + +def test_callable_handles_messages_list_prompt(): + instance = _OptimizeModelCallable(_make_opts()) + seen = _install_fake_run_async(instance, "ok") + result = instance( + [ + {"role": "system", "content": "be helpful"}, + {"role": "user", "content": "hi"}, + ] + ) + assert result == "ok" + assert len(seen) == 1 + flattened = seen[0] + assert "be helpful" in flattened + assert "hi" in flattened + assert "[system]" in flattened + assert "[user]" in flattened + + +def test_callable_run_async_is_coroutine_function(): + instance = _OptimizeModelCallable(_make_opts()) + assert inspect.iscoroutinefunction(instance._run_async) + + +def test_extract_final_text_returns_empty_for_non_final_event(): + event = MagicMock() + event.is_final_response.return_value = False + assert _extract_final_text(event) == "" + + +def test_extract_final_text_returns_empty_when_no_content(): + event = MagicMock() + event.is_final_response.return_value = True + event.content = None + assert _extract_final_text(event) == "" + + +def test_extract_final_text_returns_empty_when_parts_missing(): + event = MagicMock() + event.is_final_response.return_value = True + event.content = MagicMock() + event.content.parts = [] + assert _extract_final_text(event) == "" + + +def test_extract_final_text_skips_thought_parts(): + event = MagicMock() + event.is_final_response.return_value = True + thought = MagicMock() + thought.text = "internal monologue" + thought.thought = True + actual = MagicMock() + 
actual.text = "user-visible" + actual.thought = False + event.content = MagicMock() + event.content.parts = [thought, actual] + result = _extract_final_text(event) + assert "internal monologue" not in result + assert "user-visible" in result + + +def test_extract_final_text_joins_multiple_non_thought_parts(): + event = MagicMock() + event.is_final_response.return_value = True + a = MagicMock() + a.text = "first" + a.thought = False + b = MagicMock() + b.text = "second" + b.thought = False + event.content = MagicMock() + event.content.parts = [a, b] + result = _extract_final_text(event) + assert "first" in result + assert "second" in result diff --git a/tests/evaluation/test_optimize_model_options.py b/tests/evaluation/test_optimize_model_options.py new file mode 100644 index 0000000..28317ca --- /dev/null +++ b/tests/evaluation/test_optimize_model_options.py @@ -0,0 +1,113 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for OptimizeModelOptions.""" + +from __future__ import annotations + +import ast +from pathlib import Path + +import pytest + +from trpc_agent_sdk.evaluation._llm_criterion import JudgeModelOptions +from trpc_agent_sdk.evaluation._optimize_model_options import OptimizeModelOptions + + +_MODULE_PATH = ( + Path(__file__).resolve().parents[2] + / "trpc_agent_sdk" + / "evaluation" + / "_optimize_model_options.py" +) + + +def test_default_construction_matches_judge_default_field_values(): + opt = OptimizeModelOptions() + judge = JudgeModelOptions() + expected = { + "provider_name": judge.provider_name, + "model_name": judge.model_name, + "variant": judge.variant, + "base_url": judge.base_url, + "api_key": judge.api_key, + "extra_fields": judge.extra_fields, + "num_samples": judge.num_samples, + "generation_config": judge.generation_config, + "weight": judge.weight, + "think": judge.think, + } + actual = {key: getattr(opt, key) for key in expected} + assert actual == expected + + +def test_field_set_mirrors_judge_field_set(): + optimize_fields = set(OptimizeModelOptions.model_fields.keys()) + judge_fields = set(JudgeModelOptions.model_fields.keys()) + assert optimize_fields == judge_fields, ( + f"OptimizeModelOptions / JudgeModelOptions field set drift: " + f"only in optimize={optimize_fields - judge_fields}, " + f"only in judge={judge_fields - optimize_fields}" + ) + + +def test_is_distinct_class_not_judge_subclass(): + assert OptimizeModelOptions is not JudgeModelOptions + assert not issubclass(OptimizeModelOptions, JudgeModelOptions) + assert not issubclass(JudgeModelOptions, OptimizeModelOptions) + + +def test_module_file_has_no_import_of_llm_criterion(): + source = _MODULE_PATH.read_text(encoding="utf-8") + tree = ast.parse(source) + for node in ast.walk(tree): + if isinstance(node, ast.ImportFrom): + module = node.module or "" + assert "_llm_criterion" not in module, ( + f"_optimize_model_options.py must not import from {module!r}" + ) + if 
isinstance(node, ast.Import): + for alias in node.names: + assert "_llm_criterion" not in alias.name, ( + f"_optimize_model_options.py must not import {alias.name!r}" + ) + + +def test_json_serialization_uses_camel_alias_like_judge(): + opt = OptimizeModelOptions(model_name="gpt-4o", api_key="k", weight=0.5) + dumped = opt.model_dump(by_alias=True) + assert dumped["modelName"] == "gpt-4o" + assert dumped["apiKey"] == "k" + assert dumped["weight"] == 0.5 + + +def test_construction_accepts_full_field_set(): + opt = OptimizeModelOptions( + provider_name="openai", + model_name="gpt-4o", + variant="responses", + base_url="https://api.example.com", + api_key="sk-abc", + extra_fields={"x": 1}, + num_samples=3, + generation_config={"temperature": 0.2, "max_tokens": 1024}, + weight=0.7, + think=True, + ) + assert opt.provider_name == "openai" + assert opt.model_name == "gpt-4o" + assert opt.variant == "responses" + assert opt.base_url == "https://api.example.com" + assert opt.api_key == "sk-abc" + assert opt.extra_fields == {"x": 1} + assert opt.num_samples == 3 + assert opt.generation_config == {"temperature": 0.2, "max_tokens": 1024} + assert opt.weight == 0.7 + assert opt.think is True + + +def test_extra_fields_rejected_consistent_with_eval_base_model(): + with pytest.raises(Exception): + OptimizeModelOptions(unknown_extra_field="oops") diff --git a/tests/evaluation/test_optimize_quickstart_example.py b/tests/evaluation/test_optimize_quickstart_example.py new file mode 100644 index 0000000..404db81 --- /dev/null +++ b/tests/evaluation/test_optimize_quickstart_example.py @@ -0,0 +1,489 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Smoke tests for ``examples/optimization/quickstart``. 
+ +Goals: + * import the quickstart's ``agent`` package and ``run_optimization`` script + without side effects + * verify env-variable validation in ``agent.config.get_model_config`` + * verify ``agent.create_agent`` reads its instruction from + ``agent/prompts/system.md`` and ``agent/prompts/skill.md`` + * verify the script-level ``call_agent`` is async and exposes a single + ``query`` parameter (the contract the optimizer relies on) + * verify the quickstart's ``optimizer.json`` is a valid + ``OptimizeConfigFile`` and exercises the multi-metric scenario + * verify the end-to-end optimize flow wires together when the reflection + LLM, the gepa main loop, and the LLM judge are all mocked out + +The quickstart's ``agent`` and ``run_optimization`` are loaded by absolute path +because they live outside the python package tree. +""" + +from __future__ import annotations + +import importlib.util +import inspect +import sys +from pathlib import Path +from typing import Any + +import pytest + + +# --------------------------------------------------------------------------- +# Loader helpers (import quickstart files by path without polluting sys.modules) +# --------------------------------------------------------------------------- + + +_QUICKSTART_DIR = ( + Path(__file__).resolve().parents[2] + / "examples" + / "optimization" + / "quickstart" +) + + +def _load_quickstart_agent() -> Any: + """Import ``agent.agent`` from the quickstart example directory.""" + if str(_QUICKSTART_DIR) not in sys.path: + sys.path.insert(0, str(_QUICKSTART_DIR)) + if "agent" in sys.modules: + # ensure we always reimport against the freshly mutated env + for name in [k for k in sys.modules if k == "agent" or k.startswith("agent.")]: + sys.modules.pop(name, None) + import agent.agent as agent_mod # type: ignore + return agent_mod + + +def _load_quickstart_run_module() -> Any: + """Load ``run_optimization.py`` as an importable module without executing main().""" + if str(_QUICKSTART_DIR) not in 
sys.path: + sys.path.insert(0, str(_QUICKSTART_DIR)) + spec = importlib.util.spec_from_file_location( + "quickstart_run_optimization", + _QUICKSTART_DIR / "run_optimization.py", + ) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +@pytest.fixture +def fake_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("TRPC_AGENT_API_KEY", "fake-key") + monkeypatch.setenv("TRPC_AGENT_BASE_URL", "http://localhost/fake") + monkeypatch.setenv("TRPC_AGENT_MODEL_NAME", "fake-model") + + +# --------------------------------------------------------------------------- +# Structure / contract +# --------------------------------------------------------------------------- + + +def test_quickstart_directory_layout_matches_expected_structure(): + expected = { + "agent/__init__.py", + "agent/agent.py", + "agent/config.py", + "agent/prompts/system.md", + "agent/prompts/skill.md", + "optimizer.json", + "train.evalset.json", + "val.evalset.json", + "run_optimization.py", + } + for rel in expected: + path = _QUICKSTART_DIR / rel + assert path.exists(), f"missing quickstart file: {rel}" + + +def test_prompt_files_are_non_empty_markdown_files(): + for rel in ("agent/prompts/system.md", "agent/prompts/skill.md"): + text = (_QUICKSTART_DIR / rel).read_text(encoding="utf-8") + assert text.strip(), f"{rel} must not be empty" + + +def test_optimizer_json_declares_multi_metric_and_multi_prompt_setup(): + """The quickstart must showcase a multi-metric configuration so users see + the reporter handle the multi-metric scenario end to end. 
The judge LLM + metric (``llm_rubric_response``) must carry a populated rubrics list.""" + import json + payload = json.loads((_QUICKSTART_DIR / "optimizer.json").read_text(encoding="utf-8")) + metrics = payload["evaluate"]["metrics"] + assert len(metrics) >= 2, "quickstart should configure 2+ metrics" + names = {m["metric_name"] for m in metrics} + assert "final_response_avg_score" in names + assert "llm_rubric_response" in names + judge_metric = next(m for m in metrics if m["metric_name"] == "llm_rubric_response") + judge_cfg = judge_metric["criterion"]["llm_judge"] + assert judge_cfg.get("judge_model"), "llm_rubric_response must configure judge_model" + rubrics = judge_cfg.get("rubrics") or [] + assert len(rubrics) >= 2, "llm_rubric_response must list at least 2 rubrics" + + +def test_optimizer_json_validates_against_optimize_config_file(): + """Schema-level smoke: the example config must load cleanly via the SDK's + public loader so any breaking schema change surfaces here.""" + from trpc_agent_sdk.evaluation._optimize_config import load_optimize_config + + cfg = load_optimize_config(str(_QUICKSTART_DIR / "optimizer.json")) + metric_names = {m.metric_name for m in cfg.evaluate.get_eval_metrics()} + assert metric_names == {"final_response_avg_score", "llm_rubric_response"} + # Framework-level stop policy defaults to "all" via the example. 
+ assert cfg.optimize.stop.required_metrics == "all" + + +# --------------------------------------------------------------------------- +# agent.config: environment-variable validation +# --------------------------------------------------------------------------- + + +def test_get_model_config_raises_when_env_missing(monkeypatch: pytest.MonkeyPatch): + monkeypatch.delenv("TRPC_AGENT_API_KEY", raising=False) + monkeypatch.delenv("TRPC_AGENT_BASE_URL", raising=False) + monkeypatch.delenv("TRPC_AGENT_MODEL_NAME", raising=False) + agent_mod = _load_quickstart_agent() + with pytest.raises(ValueError) as exc_info: + agent_mod.get_model_config() + msg = str(exc_info.value) + assert "TRPC_AGENT_API_KEY" in msg + assert "TRPC_AGENT_BASE_URL" in msg + assert "TRPC_AGENT_MODEL_NAME" in msg + + +def test_get_model_config_returns_tuple_when_env_set(fake_env: None): + agent_mod = _load_quickstart_agent() + api_key, base_url, model_name = agent_mod.get_model_config() + assert api_key == "fake-key" + assert base_url == "http://localhost/fake" + assert model_name == "fake-model" + + +# --------------------------------------------------------------------------- +# agent.agent: LlmAgent factory +# --------------------------------------------------------------------------- + + +def test_create_agent_composes_instruction_from_both_prompt_files(fake_env: None): + agent_mod = _load_quickstart_agent() + from trpc_agent_sdk.agents import LlmAgent + + agent_instance = agent_mod.create_agent() + system_text = agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip() + skill_text = agent_mod.SKILL_PATH.read_text(encoding="utf-8").strip() + assert isinstance(agent_instance, LlmAgent) + assert system_text in agent_instance.instruction + assert skill_text in agent_instance.instruction + assert agent_instance.name == "math_word_problem_agent" + + +def test_create_agent_picks_up_latest_prompt_text(fake_env: None): + """Optimizer-flow sanity: rewriting any of the prompt files must be + 
visible to the next agent.""" + agent_mod = _load_quickstart_agent() + original_system = agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + original_skill = agent_mod.SKILL_PATH.read_text(encoding="utf-8") + try: + agent_mod.SYSTEM_PROMPT_PATH.write_text("UPDATED SYSTEM", encoding="utf-8") + agent_mod.SKILL_PATH.write_text("UPDATED SKILL", encoding="utf-8") + new_agent = agent_mod.create_agent() + assert "UPDATED SYSTEM" in new_agent.instruction + assert "UPDATED SKILL" in new_agent.instruction + finally: + agent_mod.SYSTEM_PROMPT_PATH.write_text(original_system, encoding="utf-8") + agent_mod.SKILL_PATH.write_text(original_skill, encoding="utf-8") + + +# --------------------------------------------------------------------------- +# run_optimization.py: call_agent contract +# --------------------------------------------------------------------------- + + +def test_run_optimization_module_exposes_async_call_agent(fake_env: None): + module = _load_quickstart_run_module() + assert inspect.iscoroutinefunction(module.call_agent), ( + "AgentOptimizer requires call_agent to be an async callable" + ) + sig = inspect.signature(module.call_agent) + params = list(sig.parameters.values()) + assert len(params) == 1 + assert params[0].name == "query" + + +def test_run_optimization_uses_runner_and_inmemory_session_service(fake_env: None): + """The example must build call_agent on top of framework primitives.""" + module = _load_quickstart_run_module() + src = (_QUICKSTART_DIR / "run_optimization.py").read_text(encoding="utf-8") + assert "from trpc_agent_sdk.runners import Runner" in src + assert "from trpc_agent_sdk.sessions import InMemorySessionService" in src + assert "AgentOptimizer.optimize" in src + assert "TargetPrompt" in src + assert hasattr(module, "main") + assert inspect.iscoroutinefunction(module.main) + + +# --------------------------------------------------------------------------- +# End-to-end wiring: optimizer flow with mocked gepa + mocked LLM judge +# 
--------------------------------------------------------------------------- + + +class _FakeGEPAResult: + def __init__(self, candidates: list[dict], val_scores: list[float]) -> None: + self.candidates = candidates + self.val_aggregate_scores = val_scores + self.parents = [[None]] + [[i - 1] for i in range(1, len(candidates))] + self.discovery_eval_counts = [0] * len(candidates) + self.total_metric_calls = 0 + self.best_outputs_valset = None + + @property + def best_idx(self) -> int: + return max( + range(len(self.val_aggregate_scores)), + key=lambda i: self.val_aggregate_scores[i], + ) + + +@pytest.mark.asyncio +async def test_quickstart_optimize_flow_runs_with_mocked_llm( + tmp_path: Path, + fake_env: None, + monkeypatch: pytest.MonkeyPatch, +): + """Full wiring: AgentOptimizer.optimize → adapter.evaluate → call_agent stub + → mocked gepa → mocked LLM judge → SUCCEEDED OptimizeResult. + + Real LLM calls (reflection_lm + judge_model) are short-circuited so the + test only exercises the framework's plumbing. + """ + from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt + from trpc_agent_sdk.evaluation._eval_metrics import EvalStatus + from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + from trpc_agent_sdk.evaluation._optimize_gepa_reflective import ( + GepaReflectiveOptimizer, + ) + + agent_mod = _load_quickstart_agent() + original_system = agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + original_skill = agent_mod.SKILL_PATH.read_text(encoding="utf-8") + + # ``stub_call_agent`` returns a string that contains every reference answer + # from train + val, so the ``contains``-based ``final_response_avg_score`` + # accepts every case (baseline_pass_rate is independently zeroed below by + # the gepa stub returning a single seed candidate that passes too). 
+ expected_answers = [ + "ē­”ę”ˆļ¼š11 äøŖ", + "ē­”ę”ˆļ¼š150 公里", + "ē­”ę”ˆļ¼š160 元", + "ē­”ę”ˆļ¼š40 äøŖ", + "ē­”ę”ˆļ¼š3.5 千克", + "ē­”ę”ˆļ¼š18 äŗŗ", + ] + + async def stub_call_agent(query: str) -> str: + return " | ".join(expected_answers) + + async def fake_judge_evaluate(self, actual_invocations, expected_invocations): + """Return a perfect EvaluationResult so llm_rubric_response is always + PASSED without touching a real judge model.""" + from trpc_agent_sdk.evaluation._eval_result import EvaluationResult + from trpc_agent_sdk.evaluation._eval_result import PerInvocationResult + + per_invocation_results = [ + PerInvocationResult( + actual_invocation=actual, + expected_invocation=expected, + score=1.0, + eval_status=EvalStatus.PASSED, + ) + for actual, expected in zip(actual_invocations, expected_invocations) + ] + return EvaluationResult( + overall_score=1.0, + overall_eval_status=EvalStatus.PASSED, + per_invocation_results=per_invocation_results, + ) + + monkeypatch.setattr(LLMJudge, "evaluate", fake_judge_evaluate) + + async def fake_call_gepa(self, **kwargs): + seed = kwargs["seed_candidate"] + improved = dict(seed) + for key in improved: + improved[key] = improved[key] + "\n\nIMPROVED" + return _FakeGEPAResult( + candidates=[seed, improved], + val_scores=[0.0, 1.0], + ) + + monkeypatch.setattr( + GepaReflectiveOptimizer, "_call_gepa_optimize", fake_call_gepa + ) + + try: + target = ( + TargetPrompt() + .add_path("system_prompt", str(agent_mod.SYSTEM_PROMPT_PATH)) + .add_path("skill", str(agent_mod.SKILL_PATH)) + ) + result = await AgentOptimizer.optimize( + config_path=str(_QUICKSTART_DIR / "optimizer.json"), + call_agent=stub_call_agent, + target_prompt=target, + train_dataset_path=str(_QUICKSTART_DIR / "train.evalset.json"), + validation_dataset_path=str(_QUICKSTART_DIR / "val.evalset.json"), + output_dir=str(tmp_path / "quickstart_runs"), + verbose=0, + ) + + assert result.status == "SUCCEEDED" + assert result.algorithm == "gepa_reflective" + assert 
result.best_pass_rate == pytest.approx(1.0) + # default update_source=False keeps both sources untouched + assert ( + agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") == original_system + ) + assert ( + agent_mod.SKILL_PATH.read_text(encoding="utf-8") == original_skill + ) + # Both registered prompts are present in best_prompts and were rewritten. + assert set(result.best_prompts.keys()) == {"system_prompt", "skill"} + assert "IMPROVED" in result.best_prompts["system_prompt"] + assert "IMPROVED" in result.best_prompts["skill"] + # Artifacts include both best_prompts files (multi-prompt scenario). + best_dir = tmp_path / "quickstart_runs" / "best_prompts" + assert (best_dir / "system_prompt.md").is_file() + assert (best_dir / "skill.md").is_file() + finally: + agent_mod.SYSTEM_PROMPT_PATH.write_text(original_system, encoding="utf-8") + agent_mod.SKILL_PATH.write_text(original_skill, encoding="utf-8") + + +# --------------------------------------------------------------------------- +# CONC-2 fix: real gepa main loop drives adapter.evaluate multiple times, +# verifying the long-lived event loop is shared across rounds and that +# module-level async resources held by call_agent stay valid. +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_quickstart_real_gepa_loop_reuses_single_event_loop_across_rounds( + tmp_path: Path, + fake_env: None, + monkeypatch: pytest.MonkeyPatch, +): + """Real gepa.optimize drives adapter.evaluate multiple times. 
The + adapter's long-lived event loop must be reused across every evaluate + so call_agent can hold module-level async resources safely.""" + import asyncio + import json + + from trpc_agent_sdk.evaluation import AgentOptimizer, TargetPrompt + from trpc_agent_sdk.evaluation._eval_metrics import EvalStatus + from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + from trpc_agent_sdk.evaluation._optimize_model_callable import ( + _OptimizeModelCallable, + ) + + agent_mod = _load_quickstart_agent() + original_system = agent_mod.SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + original_skill = agent_mod.SKILL_PATH.read_text(encoding="utf-8") + + # Track the running loop id every time call_agent fires; must stay + # constant across all evaluate() invocations. + seen_loop_ids: list[int] = [] + + expected_answers = [ + "ē­”ę”ˆļ¼š11 äøŖ", "ē­”ę”ˆļ¼š150 公里", "ē­”ę”ˆļ¼š160 元", + "ē­”ę”ˆļ¼š40 äøŖ", "ē­”ę”ˆļ¼š3.5 千克", "ē­”ę”ˆļ¼š18 äŗŗ", + ] + + async def stub_call_agent(query: str) -> str: + seen_loop_ids.append(id(asyncio.get_running_loop())) + return " | ".join(expected_answers) + + # Make the LLM judge always pass. + async def fake_judge_evaluate(self, actual_invocations, expected_invocations): + from trpc_agent_sdk.evaluation._eval_result import ( + EvaluationResult, + PerInvocationResult, + ) + return EvaluationResult( + overall_score=1.0, + overall_eval_status=EvalStatus.PASSED, + per_invocation_results=[ + PerInvocationResult( + actual_invocation=a, + expected_invocation=e, + score=1.0, + eval_status=EvalStatus.PASSED, + ) + for a, e in zip(actual_invocations, expected_invocations) + ], + ) + + monkeypatch.setattr(LLMJudge, "evaluate", fake_judge_evaluate) + + # Stub reflection LM so gepa main loop doesn't hit a real backend. + # Returns the candidate's instruction with a marker appended each time. 
+ rewrite_count = {"n": 0} + + def fake_reflection_call(self, prompt): + rewrite_count["n"] += 1 + self.total_calls += 1 + return f"REWRITE_v{rewrite_count['n']}" + + monkeypatch.setattr(_OptimizeModelCallable, "__call__", fake_reflection_call) + + # Use a tiny budget so the run finishes quickly but still exercises + # at least baseline + 1 round of adapter.evaluate (=2 evaluate calls + # minimum, in practice baseline + minibatch_eval + valset_eval per + # round = 3+ evaluate calls). + config_path = tmp_path / "optimizer.json" + config_payload = json.loads( + (_QUICKSTART_DIR / "optimizer.json").read_text(encoding="utf-8") + ) + config_payload["optimize"]["algorithm"]["max_metric_calls"] = 6 + config_payload["optimize"]["algorithm"]["reflection_minibatch_size"] = 1 + config_payload["optimize"]["algorithm"]["max_iterations_without_improvement"] = 1 + config_path.write_text(json.dumps(config_payload), encoding="utf-8") + + try: + target = ( + TargetPrompt() + .add_path("system_prompt", str(agent_mod.SYSTEM_PROMPT_PATH)) + .add_path("skill", str(agent_mod.SKILL_PATH)) + ) + result = await AgentOptimizer.optimize( + config_path=str(config_path), + call_agent=stub_call_agent, + target_prompt=target, + train_dataset_path=str(_QUICKSTART_DIR / "train.evalset.json"), + validation_dataset_path=str(_QUICKSTART_DIR / "val.evalset.json"), + output_dir=str(tmp_path / "real_gepa_runs"), + verbose=0, + ) + finally: + agent_mod.SYSTEM_PROMPT_PATH.write_text(original_system, encoding="utf-8") + agent_mod.SKILL_PATH.write_text(original_skill, encoding="utf-8") + + # Real gepa drove adapter.evaluate at least twice (baseline + round 1). + assert len(seen_loop_ids) >= 2, ( + f"Expected real gepa main loop to call call_agent more than once; " + f"saw {len(seen_loop_ids)} call(s)." + ) + + # All call_agent invocations across the entire optimize() ran on the + # same long-lived event loop (CONC-2 fix). 
+ assert len(set(seen_loop_ids)) == 1, ( + f"call_agent ran on multiple distinct loops across rounds: " + f"{set(seen_loop_ids)}. Module-level async resources would break." + ) + + # OptimizeResult is well-formed. + assert result.status in {"SUCCEEDED", "FAILED"} + assert result.algorithm == "gepa_reflective" diff --git a/tests/evaluation/test_optimize_registry.py b/tests/evaluation/test_optimize_registry.py new file mode 100644 index 0000000..ee1938c --- /dev/null +++ b/tests/evaluation/test_optimize_registry.py @@ -0,0 +1,109 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for OptimizerRegistry.""" + +from __future__ import annotations + +import pytest + +from trpc_agent_sdk.evaluation._base_optimizer import BaseOptimizer +from trpc_agent_sdk.evaluation._optimize_registry import OPTIMIZER_REGISTRY +from trpc_agent_sdk.evaluation._optimize_registry import OptimizerRegistry +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult + + +def _dummy_result() -> OptimizeResult: + return OptimizeResult( + algorithm="fake", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + total_rounds=0, + total_reflection_lm_calls=0, + total_judge_model_calls=0, + duration_seconds=0.0, + started_at="1970-01-01T00:00:00Z", + finished_at="1970-01-01T00:00:00Z", + ) + + +class _FakeOptimizerA(BaseOptimizer): + async def run(self) -> OptimizeResult: + return _dummy_result() + + +class _FakeOptimizerB(BaseOptimizer): + async def run(self) -> OptimizeResult: + return _dummy_result() + + +def test_empty_registry_lists_nothing(): + registry = OptimizerRegistry() + assert registry.list_registered() == [] + + +def test_register_and_get_returns_class(): + registry = OptimizerRegistry() + registry.register("fake_a", _FakeOptimizerA) + 
assert registry.get("fake_a") is _FakeOptimizerA + + +def test_list_registered_is_sorted(): + registry = OptimizerRegistry() + registry.register("zzz", _FakeOptimizerA) + registry.register("aaa", _FakeOptimizerB) + assert registry.list_registered() == ["aaa", "zzz"] + + +def test_register_overwrites_existing_name(): + registry = OptimizerRegistry() + registry.register("dup", _FakeOptimizerA) + registry.register("dup", _FakeOptimizerB) + assert registry.get("dup") is _FakeOptimizerB + + +def test_get_unknown_algorithm_raises_valueerror_with_available_list(): + registry = OptimizerRegistry() + registry.register("fake_a", _FakeOptimizerA) + with pytest.raises(ValueError) as exc_info: + registry.get("unknown_algo") + msg = str(exc_info.value) + assert "unknown_algo" in msg + assert "fake_a" in msg + + +def test_get_on_empty_registry_lists_empty_available(): + registry = OptimizerRegistry() + with pytest.raises(ValueError) as exc_info: + registry.get("anything") + assert "anything" in str(exc_info.value) + + +def test_register_rejects_non_basoptimizer_subclass(): + registry = OptimizerRegistry() + + class _NotAnOptimizer: + pass + + with pytest.raises(TypeError): + registry.register("bad", _NotAnOptimizer) + + +def test_module_level_singleton_is_optimizer_registry_instance(): + assert isinstance(OPTIMIZER_REGISTRY, OptimizerRegistry) + + +def test_module_level_singleton_contains_registered_algorithms(): + """Importing the evaluation package registers all available algorithms. + + The exact list grows over time, but ``gepa_reflective`` is the v1 baseline + contract: any algorithm whose optional dependencies are installed and whose + module is registered in ``_optimize_registrations`` must be reachable via + ``OPTIMIZER_REGISTRY.get(name)``. 
+ """ + assert "gepa_reflective" in OPTIMIZER_REGISTRY.list_registered() diff --git a/tests/evaluation/test_optimize_reporter.py b/tests/evaluation/test_optimize_reporter.py new file mode 100644 index 0000000..00b1d2b --- /dev/null +++ b/tests/evaluation/test_optimize_reporter.py @@ -0,0 +1,611 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tests for OptimizeReporter progress sinks (Null / Rich / ASCII).""" + +from __future__ import annotations + +import io +import logging +from typing import Any + +import pytest + +from trpc_agent_sdk.evaluation._optimize_reporter import ( + RoundView, + RunHeader, + create_reporter, + _AsciiReporter, + _NullReporter, + _SilentGepaLogger, +) + + +def _header(**overrides: Any) -> RunHeader: + defaults = dict( + algorithm="gepa_reflective", + target_fields=[("instruction", "agent/prompts/system.md")], + train_size=5, + val_size=3, + metric_names=["final_response_avg_score"], + output_dir="runs/2026-05-17T16-30-00", + ) + defaults.update(overrides) + return RunHeader(**defaults) + + +def _round_view(**overrides: Any) -> RoundView: + defaults = dict( + round=1, + kind="reflective", + train_minibatch_size=2, + train_size=5, + train_subsample_parent_score=0.0, + train_subsample_candidate_score=1.0, + val_pass_rate=1.0, + accepted=True, + skip_reason=None, + error_message=None, + duration_seconds=28.4, + budget_used=12, + budget_total=None, # "auto" + ) + defaults.update(overrides) + return RoundView(**defaults) + + +class TestFactory: + def test_verbose_zero_returns_null_reporter(self): + reporter = create_reporter(verbose=0) + assert isinstance(reporter, _NullReporter) + + def test_null_reporter_emits_nothing(self, capsys): + reporter = create_reporter(verbose=0) + reporter.run_started(_header()) + reporter.baseline_evaluated(0.0, {}) + 
reporter.round_completed(_round_view()) + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + def test_verbose_one_picks_a_real_reporter(self): + reporter = create_reporter(verbose=1, stream=io.StringIO()) + assert not isinstance(reporter, _NullReporter) + + def test_falls_back_to_ascii_reporter_when_rich_is_unavailable( + self, + monkeypatch: pytest.MonkeyPatch, + ): + """``rich`` is an optional extra of the ``optimize`` install group; + when missing, the factory must degrade gracefully to the ASCII + backend so AgentOptimizer still produces a readable timeline.""" + import builtins + real_import = builtins.__import__ + + def fake_import(name, *args, **kwargs): + if name == "rich" or name.startswith("rich."): + raise ImportError("simulated missing rich dependency") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + reporter = create_reporter(verbose=1, stream=io.StringIO()) + assert isinstance(reporter, _AsciiReporter) + + +class TestAsciiReporterRoundRendering: + """Use the ASCII backend directly so assertions don't depend on rich's + rendering quirks. Rich backend is exercised separately in TestRichReporter. + """ + + def _new(self) -> tuple[_AsciiReporter, io.StringIO]: + buf = io.StringIO() + return _AsciiReporter(stream=buf, verbose=1), buf + + def test_round_accepted_renders_one_line_with_semantic_segments(self): + reporter, buf = self._new() + reporter.round_completed(_round_view()) + output = buf.getvalue() + assert "round 1" in output + assert "accepted" in output + assert "train sample 2/5" in output + assert "sample score 0.00" in output + assert "1.00" in output + assert "valset pass_rate 1.0000" in output + assert "evaluations 12/auto" in output + # Single line per round. 
+ assert output.count("round 1") == 1 + + def test_round_skipped_subsample_perfect_uses_skip_marker(self): + reporter, buf = self._new() + reporter.round_completed( + _round_view( + round=2, + train_subsample_parent_score=1.0, + train_subsample_candidate_score=None, + val_pass_rate=None, + accepted=False, + skip_reason="minibatch already perfect (skip_perfect_score on)", + duration_seconds=3.1, + ) + ) + output = buf.getvalue() + assert "round 2" in output + assert "skipped" in output + assert "minibatch already perfect" in output + # No valset segment when skipped pre-val. + assert "valset pass_rate" not in output + + def test_round_skipped_no_proposal_omits_train_segment_when_no_minibatch(self): + reporter, buf = self._new() + reporter.round_completed( + _round_view( + round=4, + train_minibatch_size=0, + train_subsample_parent_score=None, + train_subsample_candidate_score=None, + val_pass_rate=None, + accepted=False, + skip_reason="reflect-LM produced no usable new prompt", + duration_seconds=1.2, + ) + ) + output = buf.getvalue() + assert "round 4" in output + assert "skipped" in output + assert "reflect-LM produced no usable new prompt" in output + assert "train sample" not in output + + def test_round_error_uses_error_marker(self): + reporter, buf = self._new() + reporter.round_completed( + _round_view( + round=3, + train_subsample_candidate_score=None, + val_pass_rate=None, + accepted=False, + skip_reason=None, + error_message="evaluator timeout", + duration_seconds=15.0, + ) + ) + output = buf.getvalue() + assert "round 3" in output + assert "error" in output.lower() + assert "message: evaluator timeout" in output + + def test_round_explored_when_evaluated_but_not_accepted(self): + reporter, buf = self._new() + reporter.round_completed( + _round_view( + round=6, + accepted=False, + val_pass_rate=0.42, + train_subsample_parent_score=0.3, + train_subsample_candidate_score=0.4, + ) + ) + output = buf.getvalue() + assert "round 6" in output + assert "explored" 
in output + assert "valset pass_rate 0.4200" in output + + def test_merge_round_renders_with_merge_marker(self): + reporter, buf = self._new() + reporter.round_completed(_round_view(round=7, kind="merge")) + output = buf.getvalue() + assert "round 7" in output + assert "merged" in output.lower() or "merge" in output.lower() + + +class TestAsciiReporterHeaderAndBaseline: + def _new(self) -> tuple[_AsciiReporter, io.StringIO]: + buf = io.StringIO() + return _AsciiReporter(stream=buf, verbose=1), buf + + def test_header_single_target_field_shows_basename_only(self): + reporter, buf = self._new() + reporter.run_started(_header()) + out = buf.getvalue() + assert "gepa_reflective" in out + assert "instruction" in out + # Header collapses file-backed sources to basename so deep paths + # don't dominate the panel; full paths remain in config.snapshot.json. + assert "system.md" in out + assert "agent/prompts/system.md" not in out + assert "train/val" in out or "train" in out.lower() + assert "5" in out and "3" in out + assert "runs/2026-05-17T16-30-00" in out + # Legend is printed once after the header so users can decode subsequent + # per-round lines without scrolling back to documentation. + assert "Round line legend" in out + assert "valset pass_rate" in out + assert "evaluations used/total" in out + + def test_header_multiple_target_fields_keeps_callback_sentinel(self): + reporter, buf = self._new() + reporter.run_started( + _header( + target_fields=[ + ("system_prompt", "prompts/system.md"), + ("user_template", "prompts/user.md"), + ("rubric", ""), + ], + ) + ) + out = buf.getvalue() + assert "system_prompt" in out + assert "user_template" in out + assert "rubric" in out + # File-backed sources collapse to basenames in the panel. + assert "system.md" in out + assert "user.md" in out + # Callback sources keep the explicit sentinel. 
+ assert "" in out + + def test_header_multiple_metrics_shows_count(self): + reporter, buf = self._new() + reporter.run_started( + _header( + metric_names=["final_response_avg_score", "llm_rubric_response"] + ) + ) + out = buf.getvalue() + # B2: metric count visible + assert "2" in out + assert "final_response_avg_score" in out + assert "llm_rubric_response" in out + + def test_header_long_field_name_is_truncated(self): + reporter, buf = self._new() + long_name = "this_is_a_very_long_field_name_that_must_be_truncated_for_display" + reporter.run_started( + _header(target_fields=[(long_name, "prompts/x.md")]) + ) + out = buf.getvalue() + # A4: never echo a line longer than the truncation cap + for line in out.splitlines(): + assert len(line) <= 200 # generous cap on header line width + + def test_baseline_renders_pass_rate_and_breakdown(self): + reporter, buf = self._new() + reporter.baseline_evaluated( + 0.42, {"final_response_avg_score": 0.42} + ) + out = buf.getvalue() + assert "baseline" in out.lower() + assert "0.4200" in out + + def test_baseline_shows_thresholds_and_pass_fail_status(self): + reporter, buf = self._new() + reporter.baseline_evaluated( + 0.5, + {"final_response_avg_score": 0.42, "response_match_score": 0.80}, + metric_thresholds={ + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + }, + ) + out = buf.getvalue() + # Threshold column present. + assert "threshold 0.5000" in out + assert "threshold 0.3000" in out + # PASS / FAIL status reflects evaluator semantics (score >= threshold). 
+ assert "FAIL" in out # 0.42 < 0.5 + assert "PASS" in out # 0.80 >= 0.3 + + +class TestAsciiReporterRunFinished: + def _new(self) -> tuple[_AsciiReporter, io.StringIO]: + buf = io.StringIO() + return _AsciiReporter(stream=buf, verbose=1), buf + + def _make_result(self, **overrides: Any) -> Any: + from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult, RoundRecord + defaults = dict( + algorithm="gepa_reflective", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=1.0, + pass_rate_improvement=1.0, + baseline_metric_breakdown={}, + best_metric_breakdown={}, + metric_thresholds={}, + baseline_prompts={"instruction": "old"}, + best_prompts={"instruction": "new"}, + total_rounds=2, + rounds=[], + total_reflection_lm_calls=2, + total_judge_model_calls=0, + total_llm_cost=0.0, + total_token_usage={"prompt": 0, "completion": 0, "total": 0}, + duration_seconds=142.86, + started_at="2026-05-17T16:30:00+00:00", + finished_at="2026-05-17T16:32:22+00:00", + extras={}, + ) + defaults.update(overrides) + return OptimizeResult(**defaults) + + def test_summary_panel_shows_improvement_arrow(self): + reporter, buf = self._new() + result = self._make_result() + reporter.run_finished( + result, output_dir="runs/2026-05-17T16-30-00", update_source=False, + ) + out = buf.getvalue() + assert "SUCCEEDED" in out + assert "0.0000" in out and "1.0000" in out + assert "+1.0000" in out or "+1.00" in out + assert "improved" in out + assert "142.86" in out + assert "runs/2026-05-17T16-30-00" in out + + def test_summary_panel_shows_no_improvement_when_flat(self): + reporter, buf = self._new() + result = self._make_result( + best_pass_rate=0.5, + baseline_pass_rate=0.5, + pass_rate_improvement=0.0, + finish_reason="no_improvement", + ) + reporter.run_finished( + result, output_dir="runs/x", update_source=False, + ) + out = buf.getvalue() + assert "no improvement" in out.lower() or "no_improvement" in out + + def 
test_summary_panel_marks_failed_status(self): + reporter, buf = self._new() + result = self._make_result( + status="FAILED", finish_reason="error", + error_message="dataset load failed: missing file", + ) + reporter.run_finished( + result, output_dir="runs/x", update_source=False, + ) + out = buf.getvalue() + assert "FAILED" in out + assert "dataset load failed" in out + + def test_summary_shows_update_source_when_true(self): + reporter, buf = self._new() + result = self._make_result() + reporter.run_finished( + result, output_dir="runs/x", update_source=True, + ) + out = buf.getvalue() + # G1: update_source visible + assert "update_source" in out + # Mentions the source was written back. + assert "written" in out.lower() or "true" in out.lower() + + def test_summary_shows_stopped_by_required_metrics_passing(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="required_metrics_passing") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "required metrics met thresholds" in out + + def test_summary_shows_stopped_by_budget_exhausted(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="budget_exhausted") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "budget exhausted" in out + # Disambiguates from the legacy catch-all label so users can tell the + # MaxMetricCallsStopper triggered specifically. 
+ assert "max_metric_calls" in out + + def test_summary_shows_stopped_by_no_improvement(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="no_improvement") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "no improvement" in out + + def test_summary_shows_stopped_by_timeout(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="timeout") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "timeout" in out + + def test_summary_shows_stopped_by_score_threshold(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="score_threshold") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "score threshold" in out + + def test_summary_shows_stopped_by_completed_when_no_stopper_fired(self): + reporter, buf = self._new() + result = self._make_result(stop_reason="completed") + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" in out + assert "completed" in out + assert "no stopper triggered" in out + + def test_summary_shows_stopped_by_user_requested_stop(self) -> None: + from trpc_agent_sdk.evaluation._optimize_reporter import _format_stop_reason_text + + assert _format_stop_reason_text("user_requested_stop") == ( + "user requested stop (optimize.stop touched)" + ) + + def test_summary_omits_stopped_by_when_stop_reason_none(self): + reporter, buf = self._new() + result = self._make_result(stop_reason=None) + reporter.run_finished(result, output_dir="runs/x", update_source=False) + out = buf.getvalue() + assert "stopped by" not in out + + def test_summary_per_metric_table_includes_threshold_and_status(self): + reporter, buf = self._new() + result = self._make_result( + 
baseline_metric_breakdown={ + "final_response_avg_score": 0.42, + "response_match_score": 0.10, + }, + best_metric_breakdown={ + "final_response_avg_score": 1.0, + "response_match_score": 0.20, + }, + metric_thresholds={ + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + }, + ) + reporter.run_finished( + result, output_dir="runs/x", update_source=False, + ) + out = buf.getvalue() + assert "threshold | baseline -> best" in out + assert "threshold 0.5000" in out + assert "threshold 0.3000" in out + # final_response_avg_score 1.0 >= 0.5 → PASS + # response_match_score 0.2 < 0.3 → FAIL + assert "PASS" in out + assert "FAIL" in out + + +class _CapturingHandler(logging.Handler): + """Test helper: collects every record emitted on the attached logger. + + Attached directly to the target logger (rather than relying on root / + caplog) because the ``trpc_agent_sdk`` parent logger sets + ``propagate=False`` once initialised, which would prevent caplog from + seeing child events. + """ + + def __init__(self) -> None: + super().__init__(level=logging.DEBUG) + self.records: list[logging.LogRecord] = [] + + def emit(self, record: logging.LogRecord) -> None: + self.records.append(record) + + +@pytest.fixture +def gepa_log_capture() -> tuple[logging.Logger, list[logging.LogRecord]]: + target = logging.getLogger("trpc_agent_sdk.optimizer.gepa") + handler = _CapturingHandler() + target.addHandler(handler) + previous_level = target.level + target.setLevel(logging.INFO) + try: + yield target, handler.records + finally: + target.removeHandler(handler) + target.setLevel(previous_level) + + +class TestSilentGepaLogger: + """`_SilentGepaLogger` replaces gepa's default StdOutLogger. + + verbose=1: drop every message (no stdout pollution). + verbose=2: forward to logging.getLogger("trpc_agent_sdk.optimizer.gepa") + at INFO level so users can route via logging config. 
+ """ + + def test_verbose_one_drops_message(self, capsys, gepa_log_capture): + _, records = gepa_log_capture + logger = _SilentGepaLogger(verbose=1) + logger.log("Iteration 3: Best valset aggregate score so far: 1.0") + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + assert records == [] + + def test_verbose_two_forwards_to_logging(self, gepa_log_capture): + _, records = gepa_log_capture + logger = _SilentGepaLogger(verbose=2) + logger.log("Iteration 3: Best valset aggregate score so far: 1.0") + assert any( + "Best valset aggregate" in rec.getMessage() + for rec in records + if rec.name == "trpc_agent_sdk.optimizer.gepa" + ) + + +class TestRichBackendFallback: + """When rich is unavailable, factory must fall back to ASCII silently.""" + + def test_create_reporter_falls_back_when_rich_missing(self, monkeypatch): + import builtins + real_import = builtins.__import__ + + def fake_import(name: str, *args: Any, **kwargs: Any) -> Any: + if name == "rich" or name.startswith("rich."): + raise ImportError(f"forced: {name}") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + reporter = create_reporter(verbose=1, stream=io.StringIO()) + assert isinstance(reporter, _AsciiReporter) + + +class TestRichProgressNotAutoRefreshFlood: + """Regression: the Rich Progress region must not flood the timeline with + duplicate "progress ..." lines when stdout does not honour rich's + cursor-up escapes (typical for embedded IDE terminals). + + The bug appeared as dozens of stacked ``progress ...`` rows during a + single GEPA baseline evaluation because rich's default ``auto_refresh`` + fires at 10 Hz; without functioning cursor-up the previous frame was + never erased and every refresh tick became a fresh log line. + + The fix is to build ``Progress`` with ``auto_refresh=False`` and refresh + explicitly on each round event. 
This test asserts a hard upper bound on + the number of progress lines emitted across a realistic run. + """ + + def test_progress_line_count_is_bounded_by_round_count(self): + from trpc_agent_sdk.evaluation._optimize_reporter import _RichReporter + + buf = io.StringIO() + reporter = _RichReporter(stream=buf, verbose=1) + reporter.run_started(_header(budget_total=60)) + reporter.baseline_evaluated( + 0.0, + {"final_response_avg_score": 0.0, "llm_rubric_response": 1.0}, + metric_thresholds={ + "final_response_avg_score": 1.0, + "llm_rubric_response": 0.66, + }, + ) + for round_no in range(1, 7): + reporter.round_completed( + _round_view( + round=round_no, + accepted=(round_no == 1), + skip_reason=None if round_no == 1 else "all_scores_perfect", + train_subsample_parent_score=1.0, + train_subsample_candidate_score=None, + val_pass_rate=0.6667 if round_no == 1 else None, + budget_used=10 + (round_no - 1) * 2, + budget_total=60, + ) + ) + reporter._stop_progress() + progress_lines = [ + line + for line in buf.getvalue().splitlines() + if line.lstrip().startswith("progress") + ] + # A well-behaved Live region produces at most one progress line per + # discrete event (start + 6 rounds = 7). A regression that re-enables + # auto_refresh at 10 Hz over a multi-minute baseline trivially exceeds + # this bound by an order of magnitude (we saw 30+ in the wild). + assert len(progress_lines) <= 8, ( + f"too many progress lines: {len(progress_lines)} — " + f"auto_refresh may have been re-enabled" + ) diff --git a/tests/evaluation/test_optimize_result.py b/tests/evaluation/test_optimize_result.py new file mode 100644 index 0000000..2bbe93e --- /dev/null +++ b/tests/evaluation/test_optimize_result.py @@ -0,0 +1,456 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for OptimizeResult / RoundRecord / dump_to / from_file.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from trpc_agent_sdk.evaluation._optimize_result import OptimizeResult +from trpc_agent_sdk.evaluation._optimize_result import RoundRecord + + +def _round_record(round_idx: int = 1, accepted: bool = True) -> RoundRecord: + return RoundRecord( + round=round_idx, + optimized_field_names=["system_prompt"], + candidate_prompts={"system_prompt": f"v{round_idx}"}, + train_pass_rate=0.5 + 0.1 * round_idx, + validation_pass_rate=0.4 + 0.1 * round_idx, + metric_breakdown={"final_response_avg_score": 0.6}, + accepted=accepted, + acceptance_reason=("validation_pass_rate gain 0.10 >= min_score_gain 0.0" + if accepted else "validation_pass_rate gain -0.02 < min_score_gain 0.0"), + failed_case_ids=["c1", "c2"], + failed_cases_truncated=0, + per_field_diagnosis={"system_prompt": "model said: be more careful"}, + reflection_lm_calls=1, + round_llm_cost=0.012, + round_token_usage={"prompt": 100, "completion": 50, "total": 150}, + started_at="2026-05-14T19:30:00Z", + duration_seconds=2.5, + ) + + +def _optimize_result(rounds: list[RoundRecord] | None = None) -> OptimizeResult: + rounds = rounds or [_round_record(1, accepted=True)] + return OptimizeResult( + algorithm="gepa_reflective", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.4, + best_pass_rate=0.6, + pass_rate_improvement=0.2, + baseline_metric_breakdown={"final_response_avg_score": 0.5}, + best_metric_breakdown={"final_response_avg_score": 0.7}, + baseline_prompts={"system_prompt": "v0"}, + best_prompts={"system_prompt": "v1"}, + total_rounds=len(rounds), + rounds=rounds, + total_reflection_lm_calls=1, + total_judge_model_calls=8, + total_llm_cost=0.05, + total_token_usage={"prompt": 200, "completion": 100, "total": 300}, + duration_seconds=5.0, + started_at="2026-05-14T19:30:00Z", + finished_at="2026-05-14T19:30:05Z", + ) + + +def 
test_optimize_result_algorithm_field_required(): + """algorithm must be a top-level required field per spec §3.6 / acceptance #20.""" + import pydantic + + with pytest.raises(pydantic.ValidationError) as exc: + OptimizeResult( + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + total_rounds=0, + total_reflection_lm_calls=0, + total_judge_model_calls=0, + duration_seconds=0.0, + started_at="t0", + finished_at="t1", + ) + assert any("algorithm" in str(e["loc"]) for e in exc.value.errors()) + + +def test_optimize_result_algorithm_field_round_trips(tmp_path: Path): + result = _optimize_result() + assert result.algorithm == "gepa_reflective" + target = tmp_path / "r.json" + result.dump_to(str(target)) + loaded = OptimizeResult.from_file(str(target)) + assert loaded.algorithm == "gepa_reflective" + + +def test_optimize_result_metric_thresholds_defaults_to_empty_dict(): + result = _optimize_result() + assert result.metric_thresholds == {} + + +def test_optimize_result_metric_thresholds_round_trip(tmp_path: Path): + result = _optimize_result().model_copy( + update={ + "metric_thresholds": { + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + } + } + ) + path = tmp_path / "with_thresholds.json" + result.dump_to(str(path)) + loaded = OptimizeResult.from_file(str(path)) + assert loaded.metric_thresholds == { + "final_response_avg_score": 0.5, + "response_match_score": 0.3, + } + + +def test_optimize_result_format_summary_includes_thresholds_when_provided(): + result = _optimize_result().model_copy( + update={ + "metric_thresholds": {"final_response_avg_score": 0.5}, + "baseline_metric_breakdown": {"final_response_avg_score": 0.4}, + "best_metric_breakdown": {"final_response_avg_score": 0.9}, + } + ) + summary = result.format_summary(output_dir="/tmp/runs/x", update_source=False) + assert "threshold | baseline -> best" in summary + assert "threshold 0.5000" in summary + assert "0.4000 
-> 0.9000" in summary + + +def test_round_record_minimal_construction(): + record = _round_record() + assert record.round == 1 + assert record.accepted is True + assert record.round_llm_cost == 0.012 + assert record.round_token_usage == {"prompt": 100, "completion": 50, "total": 150} + + +def test_round_record_extras_defaults_to_empty_dict(): + record = _round_record() + assert record.extras == {} + + +def test_round_record_extras_accepts_arbitrary_payload(): + record = RoundRecord( + round=1, + optimized_field_names=["a"], + candidate_prompts={"a": "x"}, + train_pass_rate=0.5, + validation_pass_rate=0.5, + metric_breakdown={}, + accepted=False, + acceptance_reason="", + failed_case_ids=[], + failed_cases_truncated=0, + per_field_diagnosis={}, + reflection_lm_calls=0, + round_llm_cost=0.0, + round_token_usage={"prompt": 0, "completion": 0, "total": 0}, + started_at="2026-05-14T19:30:00Z", + duration_seconds=1.0, + extras={"judge_subscores": [0.5, 0.6], "wandb_step": 7}, + ) + assert record.extras["judge_subscores"] == [0.5, 0.6] + assert record.extras["wandb_step"] == 7 + + +def test_optimize_result_minimal_construction(): + result = _optimize_result() + assert result.schema_version == "v1" + assert result.status == "SUCCEEDED" + assert result.finish_reason == "completed" + assert result.baseline_pass_rate == 0.4 + assert result.best_pass_rate == 0.6 + assert result.pass_rate_improvement == 0.2 + assert result.total_rounds == 1 + assert len(result.rounds) == 1 + assert result.extras == {} + + +def test_optimize_result_default_token_usage_is_zero(): + result = OptimizeResult( + algorithm="gepa_reflective", + status="SUCCEEDED", + finish_reason="completed", + baseline_pass_rate=0.0, + best_pass_rate=0.0, + pass_rate_improvement=0.0, + baseline_metric_breakdown={}, + best_metric_breakdown={}, + baseline_prompts={}, + best_prompts={}, + total_rounds=0, + rounds=[], + total_reflection_lm_calls=0, + total_judge_model_calls=0, + duration_seconds=0.0, + 
started_at="2026-05-14T19:30:00Z", + finished_at="2026-05-14T19:30:00Z", + ) + assert result.total_llm_cost == 0.0 + assert result.total_token_usage == {"prompt": 0, "completion": 0, "total": 0} + assert result.extras == {} + assert result.error_message == "" + + +@pytest.mark.parametrize("status", ["SUCCEEDED", "FAILED", "CANCELED"]) +def test_optimize_result_run_status_accepts_all_legal_values(status): + result = _optimize_result() + new = result.model_copy(update={"status": status}) + assert new.status == status + + +def test_optimize_result_rejects_illegal_run_status(): + with pytest.raises(Exception): + OptimizeResult.model_validate({**_optimize_result().model_dump(), "status": "unknown"}) + + +@pytest.mark.parametrize("reason", [ + "completed", + "perfect_pass_rate", + "no_improvement", + "error", +]) +def test_optimize_result_finish_reason_accepts_all_legal_values(reason): + result = _optimize_result() + new = result.model_copy(update={"finish_reason": reason}) + assert new.finish_reason == reason + + +def test_optimize_result_rejects_illegal_finish_reason(): + with pytest.raises(Exception): + OptimizeResult.model_validate({**_optimize_result().model_dump(), "finish_reason": "weird"}) + + +def test_optimize_result_rejects_removed_cancelled_finish_reason(): + """DOC-4: 'cancelled' was removed from FinishReason because no SDK code path + ever produces it; user cancellation surfaces as stop_reason='user_requested_stop' + + status='CANCELED'. 
Schema must reject it to keep the literal set honest.""" + with pytest.raises(Exception): + OptimizeResult.model_validate( + {**_optimize_result().model_dump(), "finish_reason": "cancelled"} + ) + + +def test_optimize_result_model_dump_json_round_trip(): + original = _optimize_result() + payload = original.model_dump_json() + restored = OptimizeResult.model_validate_json(payload) + assert restored == original + + +def test_optimize_result_dump_to_creates_indented_json_file(tmp_path: Path): + path = tmp_path / "result.json" + result = _optimize_result() + result.dump_to(str(path)) + assert path.exists() + text = path.read_text(encoding="utf-8") + payload = json.loads(text) + assert payload["status"] == "SUCCEEDED" + assert payload["finishReason"] == "completed" + assert "\n" in text + + +def test_optimize_result_from_file_round_trip(tmp_path: Path): + path = tmp_path / "result.json" + original = _optimize_result() + original.dump_to(str(path)) + restored = OptimizeResult.from_file(str(path)) + assert restored == original + + +def test_round_record_new_reporter_fields_default_to_none_or_zero(): + """New fields reporter and artifact persistence consume must default + safely so existing callers keep working unchanged.""" + record = _round_record() + assert record.kind == "reflective" + assert record.train_minibatch_size == 0 + assert record.train_subsample_parent_score is None + assert record.train_subsample_candidate_score is None + assert record.skip_reason is None + assert record.error_message is None + assert record.budget_used is None + assert record.budget_total is None + + +def test_round_record_new_reporter_fields_round_trip(): + record = RoundRecord( + round=2, + optimized_field_names=[], + candidate_prompts={"a": "x"}, + train_pass_rate=0.5, + validation_pass_rate=0.0, + metric_breakdown={}, + accepted=False, + acceptance_reason="", + failed_case_ids=[], + failed_cases_truncated=0, + per_field_diagnosis={}, + reflection_lm_calls=0, + round_llm_cost=0.0, + 
round_token_usage={"prompt": 0, "completion": 0, "total": 0}, + started_at="2026-05-17T16:30:00Z", + duration_seconds=2.1, + kind="merge", + train_minibatch_size=2, + train_subsample_parent_score=0.6, + train_subsample_candidate_score=0.4, + skip_reason=None, + error_message=None, + budget_used=42, + budget_total=200, + ) + payload = record.model_dump_json() + restored = RoundRecord.model_validate_json(payload) + assert restored == record + assert restored.kind == "merge" + assert restored.train_minibatch_size == 2 + assert restored.budget_used == 42 + assert restored.budget_total == 200 + + +def test_optimize_result_format_summary_succeeded_contains_key_fields(): + """format_summary renders the human-readable summary.txt artifact and + must surface algorithm, status, baseline/best pass rates, delta, + rounds and best_prompts inventory.""" + result = _optimize_result() + summary = result.format_summary( + output_dir="/tmp/runs/2026-05-17T16-30-00", + update_source=False, + ) + assert "gepa_reflective" in summary + assert "SUCCEEDED" in summary + assert "0.4000" in summary and "0.6000" in summary + assert "+0.2000" in summary or "+0.20" in summary + assert "improved" in summary + assert "system_prompt" in summary + assert "/tmp/runs/2026-05-17T16-30-00" in summary + + +def test_optimize_result_format_summary_failed_includes_error_message(): + result = _optimize_result().model_copy(update={ + "status": "FAILED", + "finish_reason": "error", + "error_message": "dataset load failed: missing file", + }) + summary = result.format_summary( + output_dir="/tmp/runs/x", update_source=True, + ) + assert "FAILED" in summary + assert "dataset load failed" in summary + + +def test_optimize_result_from_file_missing_path_raises(tmp_path: Path): + with pytest.raises(FileNotFoundError): + OptimizeResult.from_file(str(tmp_path / "nope.json")) + + +def test_optimize_result_camel_alias_export(): + result = _optimize_result() + dumped = result.model_dump(by_alias=True) + assert 
"schemaVersion" in dumped + assert "finishReason" in dumped + assert "baselinePassRate" in dumped + assert "totalTokenUsage" in dumped + + +def test_optimize_result_camel_case_input_accepted(): + payload = _optimize_result().model_dump(by_alias=True) + restored = OptimizeResult.model_validate(payload) + assert restored == _optimize_result() + + +def test_optimize_result_extras_round_trip_through_file(tmp_path: Path): + result = _optimize_result().model_copy( + update={"extras": {"wandb_run_id": "abc-123", "git_sha": "deadbeef"}} + ) + path = tmp_path / "result.json" + result.dump_to(str(path)) + restored = OptimizeResult.from_file(str(path)) + assert restored.extras == {"wandb_run_id": "abc-123", "git_sha": "deadbeef"} + + +def test_optimize_result_dump_to_overwrites_existing_file(tmp_path: Path): + path = tmp_path / "result.json" + path.write_text("stale content", encoding="utf-8") + result = _optimize_result() + result.dump_to(str(path)) + payload = json.loads(path.read_text(encoding="utf-8")) + assert payload["status"] == "SUCCEEDED" + + +def test_optimize_result_with_multiple_rounds(): + rounds = [ + _round_record(round_idx=1, accepted=True), + _round_record(round_idx=2, accepted=False), + _round_record(round_idx=3, accepted=True), + ] + result = _optimize_result(rounds=rounds) + assert result.total_rounds == 3 + assert result.rounds[1].accepted is False + payload = result.model_dump_json() + restored = OptimizeResult.model_validate_json(payload) + assert [r.accepted for r in restored.rounds] == [True, False, True] + + +# --------------------------------------------------------------------------- +# stop_reason +# --------------------------------------------------------------------------- + + +def test_optimize_result_stop_reason_defaults_to_none(): + result = _optimize_result() + assert result.stop_reason is None + + +@pytest.mark.parametrize( + "reason", ["required_metrics_passing", "budget_exhausted"], +) +def 
test_optimize_result_stop_reason_accepts_legal_values(reason): + result = _optimize_result().model_copy(update={"stop_reason": reason}) + assert result.stop_reason == reason + + +def test_optimize_result_stop_reason_rejects_illegal_value(): + with pytest.raises(Exception): + OptimizeResult.model_validate( + {**_optimize_result().model_dump(), "stop_reason": "weird"} + ) + + +def test_optimize_result_stop_reason_round_trip(tmp_path: Path): + result = _optimize_result().model_copy( + update={"stop_reason": "required_metrics_passing"} + ) + target = tmp_path / "r.json" + result.dump_to(str(target)) + loaded = OptimizeResult.from_file(str(target)) + assert loaded.stop_reason == "required_metrics_passing" + + +def test_optimize_result_format_summary_includes_stop_reason_when_set(): + result = _optimize_result().model_copy( + update={"stop_reason": "required_metrics_passing"} + ) + summary = result.format_summary(output_dir="/tmp/x", update_source=False) + assert "stop_reason" in summary + assert "required_metrics_passing" in summary + + +def test_optimize_result_format_summary_omits_stop_reason_when_none(): + result = _optimize_result() + summary = result.format_summary(output_dir="/tmp/x", update_source=False) + assert "stop_reason" not in summary diff --git a/tests/evaluation/test_remote_eval_service.py b/tests/evaluation/test_remote_eval_service.py index 77d0318..3f16fd1 100644 --- a/tests/evaluation/test_remote_eval_service.py +++ b/tests/evaluation/test_remote_eval_service.py @@ -128,6 +128,36 @@ async def call_agent(query: str) -> str: _ = [r async for r in service.evaluate(evaluate_req)] +@pytest.mark.asyncio +async def test_reject_llm_rubric_knowledge_recall_metric_raises_value_error(): + """F-4: ``llm_rubric_knowledge_recall`` requires tool responses from + ``intermediate_data`` which RemoteEvalService never captures (always + ``None``); the judge would silently fall back to "No knowledge search + results were found." 
Treat it the same as ``tool_trajectory_avg_score``: + fail-fast at evaluate() entry instead of letting users score every + case as 0 and chase a phantom prompt regression. + """ + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + async def call_agent(query: str) -> str: + return "world" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + inference_results = [r async for r in service.perform_inference(req)] + evaluate_req = EvaluateRequest( + inference_results=inference_results, + evaluate_config=EvaluateConfig( + eval_metrics=[EvalMetric(metric_name="llm_rubric_knowledge_recall", threshold=1.0)], + ), + ) + + with pytest.raises(ValueError, match="llm_rubric_knowledge_recall"): + _ = [r async for r in service.evaluate(evaluate_req)] + + @pytest.mark.asyncio async def test_case_fail_soft_when_call_agent_raises(): case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) diff --git a/tests/evaluation/test_target_prompt.py b/tests/evaluation/test_target_prompt.py new file mode 100644 index 0000000..8a2c7b4 --- /dev/null +++ b/tests/evaluation/test_target_prompt.py @@ -0,0 +1,539 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Tests for TargetPrompt.""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Callable +from unittest import mock + +import pytest + +from trpc_agent_sdk.evaluation._target_prompt import TargetPrompt + + +def _write(path: Path, text: str) -> Path: + path.write_text(text, encoding="utf-8") + return path + + +def test_add_path_returns_self_for_chaining(tmp_path: Path): + p1 = _write(tmp_path / "a.md", "A") + p2 = _write(tmp_path / "b.md", "B") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + assert target.names() == ["a", "b"] + + +def test_add_callback_returns_self_for_chaining(): + async def _read() -> str: + return "x" + + async def _write_fn(value: str) -> None: + return None + + target = ( + TargetPrompt() + .add_callback("c1", read=_read, write=_write_fn) + .add_callback("c2", read=_read, write=_write_fn) + ) + assert target.names() == ["c1", "c2"] + + +def test_names_in_registration_order(tmp_path: Path): + p = _write(tmp_path / "x.md", "x") + + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + return None + + target = ( + TargetPrompt() + .add_path("first", str(p)) + .add_callback("second", read=_read, write=_write_fn) + .add_path("third", str(p)) + ) + assert target.names() == ["first", "second", "third"] + + +def test_add_path_duplicate_name_raises_value_error(tmp_path: Path): + p = _write(tmp_path / "a.md", "A") + target = TargetPrompt().add_path("a", str(p)) + with pytest.raises(ValueError, match="already registered"): + target.add_path("a", str(p)) + + +def test_add_callback_duplicate_name_raises_value_error(): + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + return None + + target = TargetPrompt().add_callback("c", read=_read, write=_write_fn) + with pytest.raises(ValueError, match="already registered"): + target.add_callback("c", read=_read, write=_write_fn) + + +def 
test_add_path_and_callback_same_name_raises(tmp_path: Path): + p = _write(tmp_path / "a.md", "A") + + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + return None + + target = TargetPrompt().add_path("a", str(p)) + with pytest.raises(ValueError, match="already registered"): + target.add_callback("a", read=_read, write=_write_fn) + + +def test_empty_target_prompt_names_is_empty(): + assert TargetPrompt().names() == [] + + +def test_add_callback_requires_async_read_callable(): + def _sync_read() -> str: + return "" + + async def _write_fn(value: str) -> None: + return None + + with pytest.raises(TypeError, match="async"): + TargetPrompt().add_callback("c", read=_sync_read, write=_write_fn) + + +def test_add_callback_requires_async_write_callable(): + async def _read() -> str: + return "" + + def _sync_write(value: str) -> None: + return None + + with pytest.raises(TypeError, match="async"): + TargetPrompt().add_callback("c", read=_read, write=_sync_write) + + +@pytest.mark.asyncio +async def test_read_all_with_paths(tmp_path: Path): + p1 = _write(tmp_path / "a.md", "alpha") + p2 = _write(tmp_path / "b.md", "beta") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + assert await target.read_all() == {"a": "alpha", "b": "beta"} + + +@pytest.mark.asyncio +async def test_read_all_path_not_exist_raises_file_not_found(tmp_path: Path): + target = TargetPrompt().add_path("missing", str(tmp_path / "ghost.md")) + with pytest.raises(FileNotFoundError): + await target.read_all() + + +@pytest.mark.asyncio +async def test_read_all_with_async_callback(): + async def _read() -> str: + return "callback-value" + + async def _write_fn(value: str) -> None: + return None + + target = TargetPrompt().add_callback("k", read=_read, write=_write_fn) + assert await target.read_all() == {"k": "callback-value"} + + +@pytest.mark.asyncio +async def test_read_all_callback_raises_propagates(): + async def _read() -> str: + raise 
RuntimeError("remote down") + + async def _write_fn(value: str) -> None: + return None + + target = TargetPrompt().add_callback("k", read=_read, write=_write_fn) + with pytest.raises(RuntimeError, match="remote down"): + await target.read_all() + + +@pytest.mark.asyncio +async def test_read_all_mixed_path_and_callback(tmp_path: Path): + p = _write(tmp_path / "p.md", "from-file") + + async def _read() -> str: + return "from-callback" + + async def _write_fn(value: str) -> None: + return None + + target = ( + TargetPrompt() + .add_path("a", str(p)) + .add_callback("b", read=_read, write=_write_fn) + ) + assert await target.read_all() == {"a": "from-file", "b": "from-callback"} + + +@pytest.mark.asyncio +async def test_read_single_field(tmp_path: Path): + p = _write(tmp_path / "a.md", "single") + target = TargetPrompt().add_path("a", str(p)) + assert await target.read("a") == "single" + + +@pytest.mark.asyncio +async def test_read_unknown_name_raises_key_error(): + target = TargetPrompt() + with pytest.raises(KeyError): + await target.read("nope") + + +@pytest.mark.asyncio +async def test_write_all_with_paths_updates_files(tmp_path: Path): + p1 = _write(tmp_path / "a.md", "old-a") + p2 = _write(tmp_path / "b.md", "old-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + await target.write_all({"a": "new-a", "b": "new-b"}) + assert p1.read_text(encoding="utf-8") == "new-a" + assert p2.read_text(encoding="utf-8") == "new-b" + + +@pytest.mark.asyncio +async def test_write_all_with_callback_invokes_write(): + received: dict[str, str] = {} + + async def _read() -> str: + return received.get("k", "") + + async def _write_fn(value: str) -> None: + received["k"] = value + + target = TargetPrompt().add_callback("k", read=_read, write=_write_fn) + await target.write_all({"k": "callback-payload"}) + assert received == {"k": "callback-payload"} + + +@pytest.mark.asyncio +async def test_write_all_keys_mismatch_raises(tmp_path: Path): + p = _write(tmp_path 
/ "a.md", "A") + target = TargetPrompt().add_path("a", str(p)) + + with pytest.raises(ValueError, match="mismatch"): + await target.write_all({}) + + with pytest.raises(ValueError, match="mismatch"): + await target.write_all({"a": "ok", "extra": "x"}) + + +@pytest.mark.asyncio +async def test_write_all_no_tmp_file_remains_on_success(tmp_path: Path): + p = _write(tmp_path / "a.md", "old") + target = TargetPrompt().add_path("a", str(p)) + await target.write_all({"a": "new"}) + assert p.read_text(encoding="utf-8") == "new" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +@pytest.mark.asyncio +async def test_write_all_atomic_rollback_on_partial_failure(tmp_path: Path): + p1 = _write(tmp_path / "a.md", "old-a") + p2 = _write(tmp_path / "b.md", "old-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + + original_replace = os.replace + seen: dict[str, int] = {"count": 0} + + def _flaky_replace(src: str, dst: str) -> None: + seen["count"] += 1 + if seen["count"] == 2: + raise OSError("simulated disk failure on second rename") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(OSError, match="simulated"): + await target.write_all({"a": "new-a", "b": "new-b"}) + + # Atomicity contract: every source file is restored to its pre-call content. + assert p1.read_text(encoding="utf-8") == "old-a" + assert p2.read_text(encoding="utf-8") == "old-b" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +# --------------------------------------------------------------------------- +# CONC-3 fix: rollback uses atomic primitives + best-effort failure aggregation. +# Test matrix: T1/T2/T7 already covered above; below adds T3-T8 + edge cases. 
+# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_write_all_callback_failure_rolls_back_paths(tmp_path: Path): + """T4: callback write fails after path writes succeed; path fields must + be restored to baseline. The callback failure is propagated.""" + p1 = _write(tmp_path / "a.md", "old-a") + p2 = _write(tmp_path / "b.md", "old-b") + + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + raise RuntimeError("simulated KV write failure") + + target = ( + TargetPrompt() + .add_path("a", str(p1)) + .add_path("b", str(p2)) + .add_callback("c", read=_read, write=_write_fn) + ) + + with pytest.raises(RuntimeError, match="simulated KV"): + await target.write_all({"a": "new-a", "b": "new-b", "c": "new-c"}) + + assert p1.read_text(encoding="utf-8") == "old-a" + assert p2.read_text(encoding="utf-8") == "old-b" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +@pytest.mark.asyncio +async def test_write_all_rolls_back_to_unlink_when_baseline_absent(tmp_path: Path): + """T5: file did not exist before write_all (backup=None); rollback path + must unlink the file rather than restore content.""" + p1 = _write(tmp_path / "a.md", "old-a") + ghost = tmp_path / "ghost.md" + assert not ghost.exists() + + target = TargetPrompt().add_path("ghost", str(ghost)).add_path("a", str(p1)) + + original_replace = os.replace + seen = {"count": 0} + + def _flaky_replace(src: str, dst: str) -> None: + seen["count"] += 1 + # registration order: ghost first, a second + # call 1 = ghost.md.tmp -> ghost.md (succeeds, ghost newly created) + # call 2 = a.md.tmp -> a.md (fails -> rollback for [ghost]) + if seen["count"] == 2: + raise OSError("simulated failure on second rename") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(OSError, match="simulated"): + await 
target.write_all({"ghost": "new-ghost", "a": "new-a"}) + + assert not ghost.exists() + assert p1.read_text(encoding="utf-8") == "old-a" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +@pytest.mark.asyncio +async def test_write_all_rollback_failure_aggregates_and_chains_root_cause(tmp_path: Path): + """T3+T6: both forward write and rollback restore fail. Aggregate + _RollbackError lists the failed field; root cause preserved on __cause__.""" + from trpc_agent_sdk.evaluation._target_prompt import _RollbackError + + p1 = _write(tmp_path / "a.md", "baseline-a") + p2 = _write(tmp_path / "b.md", "baseline-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + + original_replace = os.replace + call_count = {"n": 0} + + def _flaky_replace(src: str, dst: str) -> None: + call_count["n"] += 1 + # call 1: forward a.md (succeeds) + # call 2: forward b.md (fails -> rollback for [a]) + # call 3: rollback a-restore (fails too) + if call_count["n"] == 2: + raise OSError("primary write failure") + if call_count["n"] >= 3: + raise PermissionError("rollback restore denied") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(_RollbackError) as excinfo: + await target.write_all({"a": "new-a", "b": "new-b"}) + + err = excinfo.value + assert "a" in str(err) + assert "PermissionError" in str(err) + assert isinstance(err.__cause__, OSError) + assert "primary write failure" in str(err.__cause__) + assert len(err.failures) == 1 + assert err.failures[0][0] == "a" + assert isinstance(err.failures[0][1], PermissionError) + + +@pytest.mark.asyncio +async def test_write_all_rollback_unlink_failure_aggregated(tmp_path: Path): + """T6 variant: backup=None case; unlink fails -> _RollbackError carries it.""" + from trpc_agent_sdk.evaluation._target_prompt import _RollbackError + + p1 = _write(tmp_path / "a.md", "baseline-a") + ghost = tmp_path / "ghost.md" + 
target = TargetPrompt().add_path("ghost", str(ghost)).add_path("a", str(p1)) + + original_replace = os.replace + original_unlink = os.unlink + state = {"replace_count": 0} + + def _flaky_replace(src: str, dst: str) -> None: + state["replace_count"] += 1 + # call 1: ghost.md.tmp -> ghost.md (succeeds) + # call 2: a.md.tmp -> a.md (fails -> rollback for [ghost]) + if state["replace_count"] == 2: + raise OSError("primary failure on a.md") + return original_replace(src, dst) + + def _flaky_unlink(path: str) -> None: + if str(path) == str(ghost): + raise PermissionError("unlink denied") + return original_unlink(path) + + with mock.patch("os.replace", side_effect=_flaky_replace), \ + mock.patch("os.unlink", side_effect=_flaky_unlink): + with pytest.raises(_RollbackError) as excinfo: + await target.write_all({"ghost": "g", "a": "new-a"}) + + err = excinfo.value + assert "ghost" in str(err) + assert "PermissionError" in str(err) + assert isinstance(err.__cause__, OSError) + assert "primary failure on a.md" in str(err.__cause__) + + +@pytest.mark.asyncio +async def test_write_all_rollback_continues_after_partial_failure(tmp_path: Path): + """T3 best-effort: when field A's rollback fails, field B's rollback + still runs and succeeds.""" + from trpc_agent_sdk.evaluation._target_prompt import _RollbackError + + p1 = _write(tmp_path / "a.md", "baseline-a") + p2 = _write(tmp_path / "b.md", "baseline-b") + p3 = _write(tmp_path / "c.md", "baseline-c") + target = ( + TargetPrompt() + .add_path("a", str(p1)) + .add_path("b", str(p2)) + .add_path("c", str(p3)) + ) + + original_replace = os.replace + state = {"n": 0} + + def _flaky_replace(src: str, dst: str) -> None: + state["n"] += 1 + # forward: 1=a, 2=b, 3=c (fails -> rollback for [a, b]) + # rollback: 4=a-restore (fails), 5=b-restore (succeeds) + if state["n"] == 3: + raise OSError("primary failure on c") + if state["n"] == 4: + raise PermissionError("rollback a denied") + return original_replace(src, dst) + + with 
mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(_RollbackError) as excinfo: + await target.write_all({"a": "new-a", "b": "new-b", "c": "new-c"}) + + # Best-effort: b's rollback ran and succeeded. + assert p2.read_text(encoding="utf-8") == "baseline-b" + err = excinfo.value + assert len(err.failures) == 1 + assert err.failures[0][0] == "a" + + +@pytest.mark.asyncio +async def test_write_all_rollback_uses_atomic_primitive(tmp_path: Path, monkeypatch): + """T8: critical regression. Rollback restore path must go through + _atomic_write_path (tmp + os.replace), not raw Path.write_text.""" + p1 = _write(tmp_path / "a.md", "baseline-a") + p2 = _write(tmp_path / "b.md", "baseline-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + + rollback_calls: list[str] = [] + original_atomic = TargetPrompt._atomic_write_path + + def _spy_atomic(path: str, content: str) -> None: + rollback_calls.append(path) + return original_atomic(path, content) + + monkeypatch.setattr(TargetPrompt, "_atomic_write_path", staticmethod(_spy_atomic)) + + original_replace = os.replace + state = {"n": 0} + + def _flaky_replace(src: str, dst: str) -> None: + state["n"] += 1 + if state["n"] == 2: + raise OSError("simulated") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(OSError, match="simulated"): + await target.write_all({"a": "new-a", "b": "new-b"}) + + # forward writes for a + b (2 calls), then rollback restore for a (1 call) = 3. + # If rollback used raw write_text, the third call would not appear. 
+ assert len(rollback_calls) == 3 + assert str(p1) in rollback_calls + assert p1.read_text(encoding="utf-8") == "baseline-a" + + +@pytest.mark.asyncio +async def test_write_all_keyboard_interrupt_during_callback_still_rolls_back(tmp_path: Path): + """KeyboardInterrupt is BaseException; except BaseException ensures + rollback still runs for path fields when interrupted mid-callback.""" + p1 = _write(tmp_path / "a.md", "baseline-a") + + async def _read() -> str: + return "" + + async def _write_fn(value: str) -> None: + raise KeyboardInterrupt() + + target = ( + TargetPrompt() + .add_path("a", str(p1)) + .add_callback("c", read=_read, write=_write_fn) + ) + + with pytest.raises(KeyboardInterrupt): + await target.write_all({"a": "new-a", "c": "new-c"}) + + assert p1.read_text(encoding="utf-8") == "baseline-a" + leftover = [f for f in os.listdir(tmp_path) if f.endswith(".tmp")] + assert leftover == [] + + +@pytest.mark.asyncio +async def test_write_all_no_tmp_left_after_rollback(tmp_path: Path): + """T7 extension: after forward fail + rollback success, no .tmp residue + anywhere in the directory.""" + p1 = _write(tmp_path / "a.md", "baseline-a") + p2 = _write(tmp_path / "b.md", "baseline-b") + target = TargetPrompt().add_path("a", str(p1)).add_path("b", str(p2)) + + original_replace = os.replace + state = {"n": 0} + + def _flaky_replace(src: str, dst: str) -> None: + state["n"] += 1 + if state["n"] == 2: + raise OSError("simulated") + return original_replace(src, dst) + + with mock.patch("os.replace", side_effect=_flaky_replace): + with pytest.raises(OSError): + await target.write_all({"a": "new-a", "b": "new-b"}) + + leftover = sorted(f for f in os.listdir(tmp_path) if f.endswith(".tmp")) + assert leftover == [] + assert p1.read_text(encoding="utf-8") == "baseline-a" + assert p2.read_text(encoding="utf-8") == "baseline-b" diff --git a/trpc_agent_sdk/evaluation/__init__.py b/trpc_agent_sdk/evaluation/__init__.py index 4c87990..8f614b4 100644 --- 
a/trpc_agent_sdk/evaluation/__init__.py +++ b/trpc_agent_sdk/evaluation/__init__.py @@ -179,10 +179,72 @@ from ._user_simulator_base import Status from ._user_simulator_base import UserSimulator from ._user_simulator_provider import UserSimulatorProvider +from ._agent_optimizer import AgentOptimizer +from ._base_optimizer import BaseOptimizer +from ._optimize_config import FrameworkStopConfig +from ._optimize_config import GepaReflectiveAlgo +from ._optimize_config import OptimizeConfig +from ._optimize_config import OptimizeConfigFile +from ._optimize_config import load_optimize_config +from ._optimize_evaluator_call import EvaluationOutcome +from ._optimize_evaluator_call import run_evaluator +from ._optimize_evaluator_call import summarize_outcome +from ._optimize_gepa_reflective import GepaReflectiveOptimizer +from ._optimize_metric_info import build_metric_reference_doc +from ._optimize_metric_info import build_metric_section +from ._optimize_metric_info import build_reflection_prompt_template +from ._optimize_model_callable import DEFAULT_OPTIMIZE_MAX_TOKENS +from ._optimize_model_callable import DEFAULT_OPTIMIZE_TEMPERATURE +from ._optimize_model_options import OptimizeModelOptions +from ._optimize_registry import OPTIMIZER_REGISTRY +from ._optimize_registry import OptimizerRegistry +from ._optimize_reporter import OptimizeReporter +from ._optimize_reporter import RoundView +from ._optimize_reporter import RunHeader +from ._optimize_reporter import create_reporter +from ._optimize_result import FinishReason +from ._optimize_result import OptimizeResult +from ._optimize_result import RoundKind +from ._optimize_result import RoundRecord +from ._optimize_result import RunStatus +from ._optimize_result import StopReason +from ._target_prompt import TargetPrompt from ._utils import EvalResultHandler from ._utils import MetricRunRecord +from . 
import _optimize_registrations # noqa: F401 # triggers algorithm registrations + __all__ = [ + "AgentOptimizer", + "BaseOptimizer", + "DEFAULT_OPTIMIZE_MAX_TOKENS", + "DEFAULT_OPTIMIZE_TEMPERATURE", + "EvaluationOutcome", + "FinishReason", + "FrameworkStopConfig", + "GepaReflectiveAlgo", + "GepaReflectiveOptimizer", + "OPTIMIZER_REGISTRY", + "OptimizeConfig", + "OptimizeConfigFile", + "OptimizeModelOptions", + "OptimizeReporter", + "OptimizeResult", + "OptimizerRegistry", + "RoundKind", + "RoundRecord", + "RoundView", + "RunHeader", + "RunStatus", + "StopReason", + "build_metric_reference_doc", + "build_metric_section", + "build_reflection_prompt_template", + "create_reporter", + "run_evaluator", + "summarize_outcome", + "TargetPrompt", + "load_optimize_config", "CRITERION_REGISTRY", "CriterionRegistry", "CriterionType", diff --git a/trpc_agent_sdk/evaluation/_agent_evaluator.py b/trpc_agent_sdk/evaluation/_agent_evaluator.py index f4d1da6..a2e4700 100644 --- a/trpc_agent_sdk/evaluation/_agent_evaluator.py +++ b/trpc_agent_sdk/evaluation/_agent_evaluator.py @@ -75,6 +75,24 @@ _RESULT_HANDLER = _utils.EvalResultHandler() +class _EvaluationCasesFailed(AssertionError): + """Signal raised by ``_EvalExecuter._run`` when one or more eval cases fail. + + Subclasses :class:`AssertionError` so direct ``AgentEvaluator.evaluate`` + callers (CI pytest gates such as ``examples/optimization/ci_integration``) + keep working unchanged: ``except AssertionError`` and + ``isinstance(exc, AssertionError)`` both still match, and the formatted + message remains the JSON failure summary so pytest JUnit XML output is + byte-for-byte identical to the previous ``assert False, combined``. + + Internal optimizer wrappers (``_optimize_evaluator_call.run_evaluator``) + catch this concrete subclass so unrelated ``AssertionError`` (e.g. numpy + ``assert allclose``) is no longer silently swallowed. 
Replacing the bare + ``assert`` statement also keeps the failure signal alive under + ``python -O`` where ``assert`` is stripped. + """ + + @dataclass(frozen=True) class PassNC: """(n, c): n = runs, c = runs that all passed (for pass@k / pass^k).""" @@ -101,6 +119,7 @@ def __init__( case_eval_parallelism: Optional[int] = None, callbacks: Optional[Callbacks] = None, eval_metrics_file_path_or_dir: Optional[str] = None, + print_summary_report: bool = True, ): self._agent_module = agent_module self._call_agent = call_agent @@ -108,6 +127,7 @@ def __init__( self._num_runs = num_runs self._agent_name = agent_name self._print_detailed_results = print_detailed_results + self._print_summary_report = print_summary_report self._eval_result_output_dir = eval_result_output_dir self._runner = runner self._case_parallelism = case_parallelism @@ -124,6 +144,7 @@ async def _run(self) -> None: num_runs = self._num_runs agent_name = self._agent_name print_detailed_results = self._print_detailed_results + print_summary_report = self._print_summary_report eval_result_output_dir = self._eval_result_output_dir runner = self._runner case_parallelism = self._case_parallelism @@ -189,7 +210,7 @@ async def _run(self) -> None: eval_results_by_eval_id=eval_results_by_eval_id, num_runs=num_runs_for_set, ) - if all_details or all_results: + if print_summary_report and (all_details or all_results): _RESULT_HANDLER.print_evaluation_report( all_details=all_details, all_results=all_results, @@ -207,7 +228,7 @@ async def _run(self) -> None: indent=2, ensure_ascii=False, ) - assert False, combined + raise _EvaluationCasesFailed(combined) async def _ensure_run(self) -> None: if self._task is None: @@ -354,6 +375,7 @@ def get_executer( case_eval_parallelism: Optional[int] = None, callbacks: Optional[Callbacks] = None, eval_metrics_file_path_or_dir: Optional[str] = None, + print_summary_report: bool = True, ) -> _EvalExecuter: """Return an executer (does not run). 
Await executer.evaluate() then executer.get_result() for result. @@ -377,6 +399,12 @@ def get_executer( evaluation config JSON (file) or directory containing a single config JSON. When provided, overrides the dataset-local ``test_config.json`` convention for ALL discovered datasets. + print_summary_report: When False, suppress the Execution Details and + Evaluation Result tables normally printed at the end of a run. + The result is still computed and returned by ``get_result()``. + Defaults to True for direct callers; tools that drive the + evaluator inside a larger workflow (e.g. ``AgentOptimizer``) + pass False to keep their own output unmixed. Returns: _EvalExecuter: Await .evaluate() to run, then .get_result() for EvaluateResult. @@ -394,6 +422,7 @@ def get_executer( case_eval_parallelism=case_eval_parallelism, callbacks=callbacks, eval_metrics_file_path_or_dir=eval_metrics_file_path_or_dir, + print_summary_report=print_summary_report, ) @staticmethod diff --git a/trpc_agent_sdk/evaluation/_agent_optimizer.py b/trpc_agent_sdk/evaluation/_agent_optimizer.py new file mode 100644 index 0000000..16510e7 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_agent_optimizer.py @@ -0,0 +1,614 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""AgentOptimizer: business-facing entry point for prompt optimization. + +Mirrors :class:`AgentEvaluator`: business code calls +``AgentOptimizer.optimize(...)`` and the facade dispatches to the +algorithm registered under ``config.optimize.algorithm.name`` (looked +up in :data:`OPTIMIZER_REGISTRY`). Switching algorithms is a +single-field config change. 
+""" + +from __future__ import annotations + +import inspect +import logging +import os +import signal +import sys +import threading +import warnings +from datetime import datetime +from datetime import timezone +from pathlib import Path +from typing import Any +from typing import Optional +from typing import Sequence + +from ._eval_callbacks import Callbacks +from ._optimize_config import OptimizeConfigFile +from ._optimize_config import load_optimize_config +from ._optimize_registry import OPTIMIZER_REGISTRY +from ._optimize_reporter import RunHeader +from ._optimize_reporter import create_reporter +from ._optimize_result import OptimizeResult +from ._remote_eval_service import CallAgent +from ._target_prompt import TargetPrompt + +# Metrics incompatible with call_agent (black-box) mode because their +# evaluators need data RemoteEvalService doesn't capture: +# - ``tool_trajectory_avg_score``: per-step tool call traces. +# - ``llm_rubric_knowledge_recall``: tool responses from +# ``Invocation.intermediate_data`` (RemoteEvalService leaves it None, +# so the judge would always see "No knowledge search results were +# found." for every case). +_DISALLOWED_METRICS_IN_CALL_AGENT_MODE = frozenset({ + "tool_trajectory_avg_score", + "llm_rubric_knowledge_recall", +}) + +_PROMPT_FILE_LOGGER = logging.getLogger("trpc_agent_sdk.optimizer") + + +def _atomic_write_text(path: str, content: str) -> None: + """Atomically replace ``path`` with ``content`` (UTF-8). + + Writes to a sibling ``.tmp`` then ``os.replace`` to swap into + place — POSIX guarantees rename is atomic, so a process kill or + power loss between the write and the rename leaves ``path`` either + pristine (pre-call content, or missing if it did not exist) or + fully updated, never half-written. Mirrors + :meth:`TargetPrompt._atomic_write_path` so artifact persistence + enjoys the same crash safety as source rollback. 
+""" + tmp = path + ".tmp" + Path(tmp).write_text(content, encoding="utf-8") + os.replace(tmp, path) + + +class _mask_sigint: + """Context manager that masks SIGINT for the duration of the block. + + Used by :meth:`AgentOptimizer._persist_artifacts` so a panicked second + Ctrl+C during teardown cannot interrupt artifact writes between + ``os.replace`` boundaries. Restores the previous handler on exit even + if the block raises. On threads / platforms where ``signal.signal`` + cannot install the handler (non-main thread, or the call raises) the context degrades to a + no-op rather than crashing — the underlying ``_atomic_write_text`` is + still crash-safe; only the second-Ctrl+C-during-finally race + protection is forgone. + """ + + def __init__(self) -> None: + self._previous = None + self._installed = False + + def __enter__(self) -> "_mask_sigint": + # signal.signal() only works in the main thread of the main interpreter. + if threading.current_thread() is not threading.main_thread(): + return self + try: + self._previous = signal.signal(signal.SIGINT, signal.SIG_IGN) + self._installed = True + except (ValueError, OSError): # pragma: no cover - platform fallback + # ValueError: not main thread on some platforms; OSError: signal + # not supported (rare embedded interpreters). Either way, leave + # SIGINT as-is; persistence is still best-effort. + self._installed = False + return self + + def __exit__(self, exc_type, exc, tb) -> None: + if not self._installed: + return + try: + signal.signal(signal.SIGINT, self._previous) + except (ValueError, OSError): # pragma: no cover - platform fallback + pass + + +class AgentOptimizer: + """Business-facing entry point dispatching to the registered algorithm. + + Business code passes a config file path; the facade reads + validates + it, looks up the algorithm class from + :data:`OPTIMIZER_REGISTRY` by ``config.optimize.algorithm.name``, + instantiates it, and runs the loop. 
+ + Example: + target = TargetPrompt().add_path("system_prompt", "prompts/system.md") + result = await AgentOptimizer.optimize( + config_path="optimizer.json", + call_agent=my_call_agent, + target_prompt=target, + train_dataset_path="data/train.evalset.json", + validation_dataset_path="data/val.evalset.json", + output_dir="runs/2026-05-17T16-30-00", + ) + """ + + @classmethod + async def optimize( + cls, + *, + config_path: str, + call_agent: CallAgent, + target_prompt: TargetPrompt, + train_dataset_path: str, + validation_dataset_path: str, + output_dir: str, + callbacks: Optional[Callbacks] = None, + update_source: bool = False, + verbose: int = 1, + extra_stop_callbacks: Optional[Sequence[Any]] = None, + extra_gepa_callbacks: Optional[Sequence[Any]] = None, + ) -> OptimizeResult: + """Load the config file at ``config_path`` and run the selected algorithm. + + Args: + config_path: Path to the optimizer JSON config file. + call_agent: Async callable mapping a user query to an agent response. + target_prompt: Registry of prompt fields to optimize. + train_dataset_path: Path to the training eval set file. + validation_dataset_path: Path to the validation eval set file (must + differ from ``train_dataset_path``). + output_dir: Required artifact directory. The facade creates it when + missing and persists ``result.json``, ``summary.txt``, + ``rounds/`` records, ``baseline_prompts/`` and ``best_prompts/`` + directories, a ``config.snapshot.json`` copy of the input + config, and a ``run.log`` summary line. + callbacks: Optional evaluator lifecycle callbacks. + update_source: When True, persist the best candidate back to + every registered TargetPrompt field after a SUCCEEDED + run; when False (default), source files keep their + baseline content. ``OptimizeResult.best_prompts`` always + carries the best text regardless, so callers can review + the proposal before deciding to write back. + verbose: Reporter verbosity. 
``0`` suppresses terminal + output (artifact persistence still happens). ``1`` + (default): Rich panel header + per-round line + closing + summary, falling back to ASCII when ``rich`` is missing. + ``2`` adds gepa-internal log forwarding on the + ``trpc_agent_sdk.optimizer.gepa`` logger. + extra_stop_callbacks: Runtime-only stoppers appended after + gepa-native stoppers. Useful for SLO monitors / kill + switches. Plain callables surface as + ``stop_reason="completed"``; wrap in + ``_LabeledStopper`` (or expose a ``.label`` attribute + matching :data:`StopReason`) for a stable classification. + extra_gepa_callbacks: Runtime-only gepa event callbacks + appended after the framework's built-in + ``_AgentGEPACallback`` (e.g. forwarding events to a + dashboard). Each entry should implement the + ``gepa.core.callback.GEPACallback`` protocol; gepa + silently ignores callbacks missing a method it invokes. + + Raises: + FileNotFoundError: if ``config_path`` does not exist. + pydantic.ValidationError: if the config violates schema constraints. + ValueError: if ``optimize`` section is missing; if the requested + ``algorithm.name`` is not registered (message lists every + algorithm currently in ``OPTIMIZER_REGISTRY.list_registered()``); + if ``target_prompt`` has no registered fields; if a metric + requiring session traces is configured under call_agent mode; or + if ``train_dataset_path`` and ``validation_dataset_path`` resolve + to the same file (train-test leakage guard). + TypeError: if ``call_agent`` is not an ``async`` callable. 
+""" + cls._precheck_algorithm_name(config_path) + config = load_optimize_config(config_path) + cls._validate_inputs( + config=config, + call_agent=call_agent, + target_prompt=target_prompt, + train_dataset_path=train_dataset_path, + validation_dataset_path=validation_dataset_path, + output_dir=output_dir, + ) + os.makedirs(output_dir, exist_ok=True) + + algorithm_name = config.optimize.algorithm.name + algorithm_cls = OPTIMIZER_REGISTRY.get(algorithm_name) + optimizer = algorithm_cls( + config=config, + call_agent=call_agent, + target_prompt=target_prompt, + train_dataset_path=train_dataset_path, + validation_dataset_path=validation_dataset_path, + callbacks=callbacks, + output_dir=output_dir, + extra_stop_callbacks=extra_stop_callbacks, + extra_gepa_callbacks=extra_gepa_callbacks, + ) + + reporter = create_reporter(verbose=verbose, stream=sys.stdout) + baseline_snapshot = await target_prompt.read_all() + header = cls._build_run_header( + algorithm=algorithm_name, + target_prompt=target_prompt, + config=config, + train_dataset_path=train_dataset_path, + validation_dataset_path=validation_dataset_path, + output_dir=output_dir, + ) + cls._safe_reporter_call(reporter.run_started, header) + + result: Optional[OptimizeResult] = None + # ``cleanup_done`` gates whether the ``finally`` block must restore + # baseline. It flips to True after EITHER (a) write_all(best) succeeded + # (so sources already hold the desired content and no restore is + # needed) OR (b) the ``except`` branch successfully wrote baseline back + # as part of its rollback. This single sentinel guarantees baseline + # write_all succeeds at most once per optimize() (a failed rollback is retried once in ``finally``) — important for + # callback-backed fields whose write_fn may be non-idempotent (version + # counters, audit log entries). 
+ cleanup_done = False + run_error: Optional[BaseException] = None + try: + try: + result = await optimizer.run(reporter=reporter) + except BaseException as ex: + run_error = ex + raise + + if update_source and result.status == "SUCCEEDED": + # write_all is atomic for path-backed sources (tmp + + # os.replace, rollback on partial failure). If it raises, + # sources may sit at an intermediate candidate from the + # last in-run evaluation — restore baseline explicitly + # then re-raise so the caller sees the write failure. + try: + await target_prompt.write_all(result.best_prompts) + cleanup_done = True + except Exception: + try: + await target_prompt.write_all(baseline_snapshot) + cleanup_done = True + except Exception: # pragma: no cover - defensive guard + pass + raise + finally: + if not cleanup_done: + # Best-effort restore: never mask the underlying run/write error. + try: + await target_prompt.write_all(baseline_snapshot) + except Exception: # pragma: no cover - defensive guard + pass + + cls._persist_artifacts( + result=result, + baseline_snapshot=baseline_snapshot, + output_dir=output_dir, + config_path=config_path, + run_error=run_error, + update_source=update_source, + ) + cls._emit_reporter_finish( + reporter=reporter, + result=result, + baseline_snapshot=baseline_snapshot, + output_dir=output_dir, + update_source=update_source, + run_error=run_error, + ) + return result + + @staticmethod + def _build_run_header( + *, + algorithm: str, + target_prompt: TargetPrompt, + config: OptimizeConfigFile, + train_dataset_path: str, + validation_dataset_path: str, + output_dir: str, + ) -> RunHeader: + """Collect the static run context surfaced in the terminal header. + + Train / val sizes are read from each EvalSet on disk so the header + reflects the actual material the algorithm will evaluate, including + edge cases where one of the sets is empty. 
+ """ + from ._eval_set import EvalSet + from pathlib import Path + + def _count_cases(path: str) -> int: + try: + return len(EvalSet.model_validate_json(Path(path).read_text(encoding="utf-8")).eval_cases) + except Exception: + return 0 + + target_fields: list[tuple[str, str]] = [] + for name in target_prompt.names(): + target_fields.append((name, target_prompt.describe_source(name))) + + metric_names = [metric.metric_name for metric in config.evaluate.get_eval_metrics()] + budget_total = getattr(config.optimize.algorithm, "max_metric_calls", None) + return RunHeader( + algorithm=algorithm, + target_fields=target_fields, + train_size=_count_cases(train_dataset_path), + val_size=_count_cases(validation_dataset_path), + metric_names=metric_names, + output_dir=output_dir, + budget_total=budget_total, + ) + + @staticmethod + def _safe_reporter_call(fn, *args, **kwargs) -> None: + """Invoke a reporter method, swallowing render errors.""" + try: + fn(*args, **kwargs) + except Exception: # pragma: no cover - reporter must never break the loop + _PROMPT_FILE_LOGGER.warning("reporter event failed", exc_info=True) + + @classmethod + def _emit_reporter_finish( + cls, + *, + reporter, + result: Optional[OptimizeResult], + baseline_snapshot: dict[str, str], + output_dir: str, + update_source: bool, + run_error: Optional[BaseException], + ) -> None: + if result is not None: + cls._safe_reporter_call( + reporter.run_finished, + result, + output_dir=output_dir, + update_source=update_source, + ) + return + message = (str(run_error) if run_error is not None else "optimization failed") + cls._safe_reporter_call( + reporter.run_failed, + baseline_prompts=dict(baseline_snapshot), + output_dir=output_dir, + error_message=message, + ) + + @classmethod + def _persist_artifacts( + cls, + *, + result: Optional[OptimizeResult], + baseline_snapshot: dict[str, str], + output_dir: str, + config_path: str, + run_error: Optional[BaseException], + update_source: bool, + ) -> None: + """Write run 
artifacts under ``output_dir``. + + Layout: + - ``result.json`` Full OptimizeResult JSON. + - ``summary.txt`` Human-readable summary. + - ``rounds/round_.json`` One file per RoundRecord. + - ``baseline_prompts/.md`` Pre-run snapshot of every + TargetPrompt field + (regardless of update_source). + - ``best_prompts/.md`` Best candidate per field + (only when a result was produced). + - ``config.snapshot.json`` Copy of the input config. + - ``run.log`` One-line status footer. + + SIGINT (Ctrl+C) is masked for the duration of this method so a + second Ctrl+C during persistence cannot leave half-written + artifacts. All files are written atomically (tmp + os.replace), + so even if SIGKILL or a power loss interrupts the process the + output_dir never contains a partially-written file (only a + ``.tmp`` sibling that the next run can ignore). Missing pieces + (e.g. ``best_prompts`` on early failure) are silently omitted. + """ + with _mask_sigint(): + cls._write_baseline_prompts(baseline_snapshot, output_dir) + cls._copy_config_snapshot(config_path, output_dir) + + if result is None: + cls._write_run_log( + output_dir=output_dir, + line=cls._render_failure_log_line(run_error), + ) + return + + try: + _atomic_write_text( + os.path.join(output_dir, "result.json"), + result.model_dump_json(indent=2, by_alias=True), + ) + except Exception: # pragma: no cover - defensive guard for write errors + _PROMPT_FILE_LOGGER.warning("failed to write result.json", exc_info=True) + + try: + summary_text = result.format_summary(output_dir=output_dir, update_source=update_source) + _atomic_write_text(os.path.join(output_dir, "summary.txt"), summary_text) + except Exception: # pragma: no cover + _PROMPT_FILE_LOGGER.warning("failed to write summary.txt", exc_info=True) + + cls._write_rounds_directory(result, output_dir) + cls._write_best_prompts(result, output_dir) + cls._write_run_log( + output_dir=output_dir, + line=cls._render_success_log_line(result), + ) + + @staticmethod + def 
_write_baseline_prompts(baseline_snapshot: dict[str, str], output_dir: str) -> None: + baseline_dir = os.path.join(output_dir, "baseline_prompts") + os.makedirs(baseline_dir, exist_ok=True) + for name, content in baseline_snapshot.items(): + path = os.path.join(baseline_dir, f"{name}.md") + try: + _atomic_write_text(path, content) + except Exception: # pragma: no cover + _PROMPT_FILE_LOGGER.warning("failed to write baseline prompt %s", name, exc_info=True) + + @staticmethod + def _write_best_prompts(result: OptimizeResult, output_dir: str) -> None: + best_dir = os.path.join(output_dir, "best_prompts") + os.makedirs(best_dir, exist_ok=True) + for name, content in result.best_prompts.items(): + path = os.path.join(best_dir, f"{name}.md") + try: + _atomic_write_text(path, content) + except Exception: # pragma: no cover + _PROMPT_FILE_LOGGER.warning("failed to write best prompt %s", name, exc_info=True) + + @staticmethod + def _write_rounds_directory(result: OptimizeResult, output_dir: str) -> None: + rounds_dir = os.path.join(output_dir, "rounds") + os.makedirs(rounds_dir, exist_ok=True) + for record in result.rounds: + path = os.path.join(rounds_dir, f"round_{record.round:03d}.json") + try: + _atomic_write_text(path, record.model_dump_json(indent=2, by_alias=True)) + except Exception: # pragma: no cover + _PROMPT_FILE_LOGGER.warning("failed to write round %s", record.round, exc_info=True) + + @staticmethod + def _copy_config_snapshot(config_path: str, output_dir: str) -> None: + target = os.path.join(output_dir, "config.snapshot.json") + try: + # Read + atomic-write rather than shutil.copyfile so an interrupted + # copy cannot leave a half-written ``config.snapshot.json``. 
+ content = Path(config_path).read_text(encoding="utf-8") + _atomic_write_text(target, content) + except Exception: # pragma: no cover + _PROMPT_FILE_LOGGER.warning("failed to copy config snapshot", exc_info=True) + + @staticmethod + def _write_run_log(*, output_dir: str, line: str) -> None: + try: + _atomic_write_text( + os.path.join(output_dir, "run.log"), + line.rstrip("\n") + "\n", + ) + except Exception: # pragma: no cover + _PROMPT_FILE_LOGGER.warning("failed to write run.log", exc_info=True) + + @staticmethod + def _render_success_log_line(result: OptimizeResult) -> str: + return (f"{datetime.now(timezone.utc).isoformat()} status={result.status} " + f"algorithm={result.algorithm} " + f"baseline={result.baseline_pass_rate:.4f} " + f"best={result.best_pass_rate:.4f} " + f"delta={result.pass_rate_improvement:+.4f} " + f"rounds={result.total_rounds} " + f"duration_seconds={result.duration_seconds:.2f}") + + @staticmethod + def _render_failure_log_line(run_error: Optional[BaseException]) -> str: + msg = str(run_error) if run_error else "optimization failed before result" + return (f"{datetime.now(timezone.utc).isoformat()} status=FAILED " + f"error={msg!r}") + + @staticmethod + def _precheck_algorithm_name(config_path: str) -> None: + """Friendly fail-fast when ``algorithm.name`` is unknown. + + ``GepaReflectiveAlgo.name`` is declared as ``Literal["gepa_reflective"]`` + for future pydantic-discriminator-based union routing. The Literal + causes pydantic to reject unknown names with a ``literal_error`` that + does not list available algorithms. We pre-read the raw JSON, look up + ``algorithm.name`` against ``OPTIMIZER_REGISTRY``, and raise a + ``ValueError`` listing every registered algorithm before pydantic's + Literal check fires. If parsing fails or the field is absent we let + pydantic's normal error path handle it (so we do not duplicate + formatting errors). 
+ """ + import json + + try: + with open(config_path, "r", encoding="utf-8") as f: + raw = json.load(f) + except (OSError, json.JSONDecodeError): + return # let pydantic / load_optimize_config surface the real cause + + try: + name = raw["optimize"]["algorithm"]["name"] + except (KeyError, TypeError): + return # malformed shape: pydantic will raise a structured error + + if not isinstance(name, str): + return # type error: let pydantic's normal validation handle it + + registered = OPTIMIZER_REGISTRY.list_registered() + if name not in registered: + raise ValueError(f"No optimizer registered for algorithm: {name!r}. " + f"Available algorithms: {registered}") + + @staticmethod + def _validate_inputs( + *, + config, + call_agent: CallAgent, + target_prompt: TargetPrompt, + train_dataset_path: str, + validation_dataset_path: str, + output_dir: str, + ) -> None: + """Startup-time fail-fast checks. + + Reports actionable error messages so misconfigurations surface before + any LLM call is made. + """ + if not output_dir or not isinstance(output_dir, str): + raise ValueError("output_dir is required and must be a non-empty path; " + "pass output_dir='runs/' or similar.") + + if not target_prompt.names(): + raise ValueError("TargetPrompt has no registered fields; " + "call .add_path(...) or .add_callback(...) before optimize().") + + # Accept async functions and partials wrapping a coroutine function. + is_async = inspect.iscoroutinefunction(call_agent) + if not is_async: + wrapped = getattr(call_agent, "__wrapped__", None) + is_async = wrapped is not None and inspect.iscoroutinefunction(wrapped) + if not is_async: + raise TypeError("call_agent must be an async callable (async def or " + "Callable returning Awaitable[str]); " + f"got {type(call_agent).__name__}.") + + # Normalize so trivially-different strings ('./x', 'x') still collide + # when they resolve to the same file (train-validation leakage guard). 
+ train_norm = os.path.normpath(os.path.abspath(train_dataset_path)) + val_norm = os.path.normpath(os.path.abspath(validation_dataset_path)) + if train_norm == val_norm: + raise ValueError("train_dataset_path and validation_dataset_path resolve to the " + f"same file ({train_norm}); use distinct datasets to avoid " + "train-validation leakage.") + + # call_agent (black-box) mode can't supply session traces or + # tool intermediate_data. ``get_eval_metrics()`` normalizes both + # 'metrics' and 'criteria' encodings so this check is uniform. + for metric in config.evaluate.get_eval_metrics(): + if metric.metric_name in _DISALLOWED_METRICS_IN_CALL_AGENT_MODE: + raise ValueError(f"Metric '{metric.metric_name}' requires session " + "traces or tool intermediate data, which call_agent " + "(black-box) mode does not capture; remove it from " + "evaluate.metrics or switch to a response-based metric " + "(e.g. final_response_avg_score, llm_rubric_response, " + "llm_final_response).") + + # gepa merge degenerates to "pick one of two parents" with a single + # component, never producing new candidates. Warn instead of error + # so existing benign configs keep running; user gets a clear hint + # that merge_rounds_total will be 0. + algo = config.optimize.algorithm + if (getattr(algo, "name", None) == "gepa_reflective" and getattr(algo, "use_merge", False) + and len(target_prompt.names()) < 2): + warnings.warn( + "use_merge=true requires TargetPrompt to register at least 2 " + "fields. With a single field, gepa merge degenerates to " + "picking one of the two parents and never creates new " + "candidates (merge_rounds_total stays 0). Set use_merge=false " + "or register more prompt fields. 
See " + "examples/optimization/advanced_strategies/README.md §6.1.", + UserWarning, + stacklevel=2, + ) diff --git a/trpc_agent_sdk/evaluation/_base_optimizer.py b/trpc_agent_sdk/evaluation/_base_optimizer.py new file mode 100644 index 0000000..6d79027 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_base_optimizer.py @@ -0,0 +1,123 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Abstract base class for prompt optimization algorithms.""" + +from __future__ import annotations + +from abc import ABC +from abc import abstractmethod +from typing import TYPE_CHECKING +from typing import Any +from typing import Optional +from typing import Sequence + +from ._eval_callbacks import Callbacks +from ._optimize_config import FrameworkStopConfig +from ._optimize_config import OptimizeConfigFile +from ._optimize_result import OptimizeResult +from ._remote_eval_service import CallAgent +from ._target_prompt import TargetPrompt + +if TYPE_CHECKING: + from ._optimize_reporter import OptimizeReporter + + +class BaseOptimizer(ABC): + """Abstract base class for prompt optimization algorithms. + + Subclasses implement `run()` to execute one full optimization loop + against the supplied config, evaluator inputs, and TargetPrompt. 
+ """ + + def __init__( + self, + *, + config: OptimizeConfigFile, + call_agent: CallAgent, + target_prompt: TargetPrompt, + train_dataset_path: str, + validation_dataset_path: str, + callbacks: Optional[Callbacks] = None, + output_dir: Optional[str] = None, + extra_stop_callbacks: Optional[Sequence[Any]] = None, + extra_gepa_callbacks: Optional[Sequence[Any]] = None, + ) -> None: + self.config = config + self.call_agent = call_agent + self.target_prompt = target_prompt + self.train_dataset_path = train_dataset_path + self.validation_dataset_path = validation_dataset_path + self.callbacks = callbacks + self.output_dir = output_dir + # Runtime-only hooks are not part of the JSON config schema + # because they're Python callables (SLO monitors, kill switches, + # custom telemetry sinks) whose identity is meaningful and + # cannot be serialised. Plain stoppers surface a generic + # ``"completed"`` stop_reason unless wrapped in + # ``_LabeledStopper``. + self.extra_stop_callbacks: list[Any] = (list(extra_stop_callbacks) if extra_stop_callbacks else []) + self.extra_gepa_callbacks: list[Any] = (list(extra_gepa_callbacks) if extra_gepa_callbacks else []) + + @abstractmethod + async def run( + self, + *, + reporter: Optional["OptimizeReporter"] = None, + ) -> OptimizeResult: + """Execute the optimization loop and return the final OptimizeResult. + + Args: + reporter: Progress sink for ``baseline_evaluated`` and + ``round_completed`` events. The facade always supplies + a non-None instance (``_NullReporter`` when + ``verbose=0``); subclasses may treat ``None`` as a noop + for direct invocations. + """ + + @staticmethod + def resolve_required_thresholds( + stop_config: FrameworkStopConfig, + metric_thresholds: dict[str, float], + ) -> dict[str, float]: + """Return the subset of thresholds the framework stop policy enforces. + + Resolution rules: + - ``required_metrics`` is None or empty list → ``{}`` (disabled). + - ``required_metrics == "all"`` → copy of all thresholds. 
+ - non-empty list → ``metric_thresholds`` + filtered to listed names. Unknown names are silently dropped + (cross-field validation on :class:`OptimizeConfigFile` + already rejects them at config load time). + + Algorithms call this once per run and feed the result to + :meth:`metrics_meet_thresholds`. + """ + required = stop_config.required_metrics + if required is None: + return {} + if isinstance(required, list): + if not required: + return {} + allowed = set(required) + return {name: thr for name, thr in metric_thresholds.items() if name in allowed} + return dict(metric_thresholds) + + @staticmethod + def metrics_meet_thresholds( + metric_breakdown: dict[str, float], + required_thresholds: dict[str, float], + ) -> bool: + """True iff every required metric meets its threshold. + + Returns ``False`` when ``required_thresholds`` is empty so the + policy is a no-op when nothing is required. Callers obtain + ``required_thresholds`` from :meth:`resolve_required_thresholds` + for consistent "all / list / None / empty" semantics. + """ + if not required_thresholds: + return False + return all( + metric_breakdown.get(name, float("-inf")) >= threshold for name, threshold in required_thresholds.items()) diff --git a/trpc_agent_sdk/evaluation/_optimize_config.py b/trpc_agent_sdk/evaluation/_optimize_config.py new file mode 100644 index 0000000..3154712 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_config.py @@ -0,0 +1,257 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Optimizer configuration schema. + +Each registered algorithm contributes a pydantic model under +``OptimizeConfig.algorithm``; field names mirror the upstream library +(e.g. https://github.com/gepa-ai/gepa) 1:1 so users can cross-reference +upstream docs without translating. 
+ +The top-level ``optimize`` section only carries algorithm-agnostic +switches (e.g. evaluator parallelism, framework stop policies); any +switch whose effect depends on the selected algorithm lives inside the +algorithm block. +""" + +from __future__ import annotations + +from typing import Literal +from typing import Optional +from typing import Union + +from pydantic import Field +from pydantic import model_validator + +from ._common import EvalBaseModel +from ._eval_config import EvalConfig +from ._optimize_model_options import OptimizeModelOptions + + +class GepaReflectiveAlgo(EvalBaseModel): + """gepa_reflective algorithm configuration. + + Field names mirror ``gepa.optimize`` parameters and gepa + ``StopperProtocol`` constructor arguments so config maps to gepa + docs directly. + """ + + name: Literal["gepa_reflective"] = Field(description="Algorithm discriminator tag.", ) + + seed: int = Field( + default=42, + description="Random seed forwarded to gepa.optimize(seed=...).", + ) + reflection_lm: OptimizeModelOptions = Field( + description=("LLM gepa uses to reflect on failed cases and propose new prompts. " + "Forwarded to gepa.optimize(reflection_lm=...)."), ) + + candidate_selection_strategy: Literal[ + "pareto", + "current_best", + "epsilon_greedy", + "top_k_pareto", + ] = Field( + default="pareto", + description="Strategy gepa uses to pick the parent candidate each round.", + ) + module_selector: str = Field( + default="round_robin", + description="Component selector passed to gepa (e.g. 
'round_robin', 'all').", + ) + frontier_type: Literal["instance", "objective", "hybrid", "cartesian"] = Field( + default="instance", + description="Pareto frontier tracking granularity forwarded to gepa.", + ) + reflection_minibatch_size: Optional[int] = Field( + default=None, + description="Per-round minibatch size for the reflective dataset; None lets gepa decide.", + ) + reflection_history_top_k: int = Field( + default=2, + ge=0, + le=5, + description=("How many historical best traces per case to expose to the " + "reflection LM as the ``history_top_k`` record field. 0 " + "disables the feature. Capped at 5 to bound prompt-token " + "growth — for K=2 a typical multi-turn case grows ~30%."), + ) + perfect_score: float = Field( + default=1.0, + description="Score considered 'perfect' for skip_perfect_score decisions.", + ) + skip_perfect_score: bool = Field( + default=True, + description="Whether gepa skips optimizing instances that already score perfect.", + ) + + use_merge: bool = Field( + default=False, + description="Whether to enable gepa merge-based candidate proposals.", + ) + max_merge_invocations: int = Field( + default=5, + description="Maximum merge invocations when use_merge is true.", + ) + merge_val_overlap_floor: int = Field( + default=5, + description="Minimum shared validation ids required before attempting a merge subsample.", + ) + + cache_evaluation: bool = Field( + default=False, + description="Cache (candidate, case) scores so repeated evaluations skip the metric call.", + ) + track_best_outputs: bool = Field( + default=False, + description="Track per-case best outputs alongside the best candidate.", + ) + + max_metric_calls: Optional[int] = Field( + default=None, + description=("Stop after this many metric calls (one metric call = one case-level " + "evaluation). Mapped to gepa MaxMetricCallsStopper. 
At least one of the " + "five stop conditions on this object must be set."), + ) + max_iterations_without_improvement: Optional[int] = Field( + default=None, + description=("Stop after this many consecutive iterations whose best valset score " + "did not improve. Mapped to gepa NoImprovementStopper."), + ) + timeout_seconds: Optional[float] = Field( + default=None, + description=("Stop after this many wall-clock seconds. Mapped to gepa " + "TimeoutStopCondition."), + ) + score_threshold: Optional[float] = Field( + default=None, + description=("Stop once the best valset score reaches this threshold. Mapped to " + "gepa ScoreThresholdStopper."), + ) + max_candidate_proposals: Optional[int] = Field( + default=None, + description=("Stop after this many candidate proposals. Mapped to gepa " + "MaxCandidateProposalsStopper."), + ) + max_tracked_candidates: Optional[int] = Field( + default=None, + description=("Stop once the candidate pool reaches this size. Mapped to gepa " + "MaxTrackedCandidatesStopper."), + ) + + @model_validator(mode="after") + def _require_at_least_one_stop_condition(self) -> "GepaReflectiveAlgo": + if not any(value is not None for value in ( + self.max_metric_calls, + self.max_iterations_without_improvement, + self.timeout_seconds, + self.score_threshold, + self.max_candidate_proposals, + self.max_tracked_candidates, + )): + raise ValueError("gepa_reflective requires at least one stop condition: set one of " + "max_metric_calls / max_iterations_without_improvement / " + "timeout_seconds / score_threshold / max_candidate_proposals / " + "max_tracked_candidates.") + return self + + +class FrameworkStopConfig(EvalBaseModel): + """Framework-level stop policies applied to every algorithm. + + Today the only such policy is metric-based early stopping: stop + when every metric named by ``required_metrics`` meets its threshold + on the validation set. 
Threshold values come from + ``evaluate.metrics[].threshold``; this section only decides which + metrics participate. + + Pass-rate-based stopping is not exposed here because every supported + engine has an equivalent native field (e.g. ``algorithm.score_threshold`` + for gepa_reflective). + + Field values for ``required_metrics``: + - ``"all"`` (default): every metric in ``evaluate.metrics[]`` + must meet its threshold. + - ``list[str]``: only the listed metrics must meet thresholds. + Each name must match an entry in + ``evaluate.metrics[].metric_name`` (validated by + :class:`OptimizeConfigFile`). Empty list disables the policy. + - ``None``: disable the policy entirely; the run finishes only + via algorithm-native stop conditions. + """ + + required_metrics: Optional[Union[Literal["all"], list[str]]] = Field( + default="all", + description=("Metrics whose thresholds must be met on the validation set " + "before the framework asks the algorithm to stop. 'all' means " + "every metric in evaluate.metrics[]; a list narrows the set; " + "None or [] disables the policy."), + ) + + +class OptimizeConfig(EvalBaseModel): + """Algorithm-agnostic optimizer section. + + Holds switches the framework itself consumes; algorithm-specific + knobs live under :attr:`algorithm` so different algorithms can + expose entirely different field sets without polluting one another. + + To add a second algorithm: + 1. Define ``MyAlgo(EvalBaseModel)`` with ``name: Literal["my_algo"]``. + 2. Replace :attr:`algorithm` type with:: + + algorithm: Annotated[ + Union[GepaReflectiveAlgo, MyAlgo], + Field(discriminator="name"), + ] + + pydantic v2 then routes validation by the ``name`` tag and + rejects unknown algorithm names with a clear error. 
+ """ + + eval_case_parallelism: int = Field( + default=4, + description="Case-level parallelism forwarded to the evaluator.", + ) + stop: FrameworkStopConfig = Field( + default_factory=FrameworkStopConfig, + description=("Framework-level stop policies; OR'd with any algorithm-native " + "stop conditions configured under :attr:`algorithm`."), + ) + algorithm: GepaReflectiveAlgo = Field(description="Algorithm selection and algorithm-specific parameters.", ) + + +class OptimizeConfigFile(EvalBaseModel): + """Top-level schema for an optimizer JSON config file.""" + + evaluate: EvalConfig = Field(description="Evaluator section: same schema as evaluator's EvalConfig.", ) + optimize: OptimizeConfig = Field(description="Optimizer section: framework switches plus the algorithm block.", ) + + @model_validator(mode="after") + def _validate_required_metrics_against_evaluate(self) -> "OptimizeConfigFile": + required = self.optimize.stop.required_metrics + if not isinstance(required, list) or not required: + return self + available = {metric.metric_name for metric in self.evaluate.get_eval_metrics()} + unknown = [name for name in required if name not in available] + if unknown: + raise ValueError("stop.required_metrics references unknown metric(s) " + f"{unknown}; available metrics from evaluate.metrics[]: " + f"{sorted(available)}") + return self + + +def load_optimize_config(path: str) -> OptimizeConfigFile: + """Load and parse an optimizer JSON config file. + + Accepts camelCase and snake_case keys. + + Raises: + FileNotFoundError: if path does not exist. + pydantic.ValidationError: on schema violations. 
+ """ + with open(path, "r", encoding="utf-8") as f: + content = f.read() + return OptimizeConfigFile.model_validate_json(content) diff --git a/trpc_agent_sdk/evaluation/_optimize_evaluator_call.py b/trpc_agent_sdk/evaluation/_optimize_evaluator_call.py new file mode 100644 index 0000000..73b4614 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_evaluator_call.py @@ -0,0 +1,136 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Optimizer-facing wrapper around AgentEvaluator.""" + +from __future__ import annotations + +from dataclasses import dataclass +from dataclasses import field +from statistics import mean +from typing import Optional + +from ._agent_evaluator import AgentEvaluator +from ._agent_evaluator import _EvaluationCasesFailed +from ._eval_callbacks import Callbacks +from ._eval_metrics import EvalStatus +from ._eval_result import EvaluateResult +from ._remote_eval_service import CallAgent + + +@dataclass(frozen=True) +class EvaluationOutcome: + """Summary metrics extracted from an EvaluateResult for the optimizer. + + Attributes: + pass_rate: Fraction of cases whose final_eval_status is PASSED. + tiebreaker: Mean of all per-case metric scores; used when pass_rate ties. + metric_breakdown: Mean score per metric name across all cases. + failed_case_ids: Eval ids of cases that did not pass; duplicated across runs. + judge_model_calls: Currently always 0; the evaluator does not surface per-judge invocation counts. + raw_result: The original EvaluateResult for downstream inspection. 
+ """ + + pass_rate: float + tiebreaker: float + metric_breakdown: dict[str, float] = field(default_factory=dict) + failed_case_ids: list[str] = field(default_factory=list) + judge_model_calls: int = 0 + raw_result: Optional[EvaluateResult] = None + + +def summarize_outcome(result: EvaluateResult) -> EvaluationOutcome: + """Reduce a raw EvaluateResult to the metrics the optimizer needs. + + judge_model_calls is set to 0 here; remote evaluators may overwrite it + after the call returns when actual judge invocation counts are known. + """ + total = 0 + passed = 0 + failed_case_ids: list[str] = [] + scores_by_metric: dict[str, list[float]] = {} + + for set_result in result.results_by_eval_set_id.values(): + for eval_id, runs in set_result.eval_results_by_eval_id.items(): + for run in runs: + total += 1 + if run.final_eval_status == EvalStatus.PASSED: + passed += 1 + else: + failed_case_ids.append(eval_id) + for metric in run.overall_eval_metric_results: + if metric.score is None: + continue + scores_by_metric.setdefault(metric.metric_name, []).append(metric.score) + + pass_rate = passed / total if total > 0 else 0.0 + metric_breakdown = {name: mean(scores) for name, scores in scores_by_metric.items()} + all_scores = [s for scores in scores_by_metric.values() for s in scores] + tiebreaker = mean(all_scores) if all_scores else 0.0 + + return EvaluationOutcome( + pass_rate=pass_rate, + tiebreaker=tiebreaker, + metric_breakdown=metric_breakdown, + failed_case_ids=failed_case_ids, + judge_model_calls=0, + raw_result=result, + ) + + +async def run_evaluator( + *, + eval_dataset_path: str, + eval_metrics_path: Optional[str], + call_agent: CallAgent, + callbacks: Optional[Callbacks], + num_runs: int = 1, + case_parallelism: Optional[int] = None, +) -> EvaluationOutcome: + """Run the evaluator over a dataset and summarize the outcome. + + Args: + eval_dataset_path: Path to an eval set file or directory of eval sets. 
+ eval_metrics_path: Path to a shared metrics config file; None falls back to dataset-local config. + call_agent: Async function that maps a user query to an agent response. + callbacks: Optional lifecycle callbacks passed through to the evaluator. + num_runs: Number of runs per eval set. + case_parallelism: Max concurrent cases for inference; None lets the + evaluator use its default. Plumbs ``optimize.eval_case_parallelism`` + through to :meth:`AgentEvaluator.get_executer`. + + Returns: + EvaluationOutcome with extracted pass_rate / tiebreaker / metric_breakdown / failed_case_ids. + """ + executer = AgentEvaluator.get_executer( + eval_dataset_path, + call_agent=call_agent, + callbacks=callbacks, + num_runs=num_runs, + print_detailed_results=False, + print_summary_report=False, + eval_result_output_dir=None, + eval_metrics_file_path_or_dir=eval_metrics_path, + case_parallelism=case_parallelism, + ) + # _EvaluationCasesFailed signals "some cases failed" — the evaluator has + # already populated ``executer.get_result()`` before raising, so we swallow + # this specific subclass and let the optimizer keep iterating. Any other + # exception (FileNotFoundError, network error, third-party AssertionError, + # ...) is a real failure and must propagate: silently substituting an empty + # EvaluateResult would make the optimizer see a 0.0 pass_rate and continue + # optimizing against phantom data. + try: + await executer.evaluate() + except _EvaluationCasesFailed: + pass + result = executer.get_result() + if result is None: + # _run raised before populating self._result. This only happens on a + # real upstream error (which would have re-raised above) or a logic + # bug. Return an empty outcome rather than crash, but the path is + # defensive — not a normal control-flow branch. 
+ result = EvaluateResult() + return summarize_outcome(result) diff --git a/trpc_agent_sdk/evaluation/_optimize_gepa_adapter.py b/trpc_agent_sdk/evaluation/_optimize_gepa_adapter.py new file mode 100644 index 0000000..34fc4dd --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_gepa_adapter.py @@ -0,0 +1,794 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""GEPA protocol adapter and reflective-dataset builder. + +Implements ``gepa.core.adapter.GEPAAdapter`` so gepa's main loop can +drive evaluation through the framework's ``AgentEvaluator``. The +adapter stays decoupled from any specific gepa algorithm class so +gepa-family algorithms can reuse it without duplicating evaluator I/O. + +:meth:`_AgentGEPAAdapter.make_reflective_dataset` renders each failed +case into a turn-sliced markdown record +(``{case_id, score, "Case Body", "Other Active Components"?}``) tuned +for the reflection LM in multi-component / multi-turn / multi-run / +tool-using scenarios. + +``gepa`` is an optional dependency: ``EvaluationBatch`` is imported +lazily inside :meth:`_AgentGEPAAdapter.evaluate`, so importing this +module without ``gepa`` installed succeeds but ``evaluate`` then fails +fast. 
+""" + +from __future__ import annotations + +import asyncio +import tempfile +import uuid +from pathlib import Path +from typing import Any +from typing import Mapping +from typing import Optional +from typing import Sequence + +from ._eval_callbacks import Callbacks +from ._eval_case import EvalCase +from ._eval_case import Invocation +from ._eval_case import get_all_tool_calls +from ._eval_case import get_all_tool_responses +from ._eval_config import EvalConfig +from ._eval_metrics import EvalStatus +from ._eval_metrics import PrebuiltMetrics +from ._eval_result import EvalCaseResult +from ._eval_result import EvalMetricResult +from ._eval_result import EvaluateResult +from ._eval_set import EvalSet +from ._optimize_evaluator_call import run_evaluator +from ._remote_eval_service import CallAgent +from ._target_prompt import TargetPrompt + + +def _extract_case_output(case_result: EvalCaseResult) -> str: + """Return the agent's final response text from the first per-invocation entry. + + Used to populate ``EvaluationBatch.outputs`` — GEPA reads that field + directly to decide whether a candidate's behaviour improved on a case + even before consulting the trajectory or score. 
+ """ + per_inv = case_result.eval_metric_result_per_invocation or [] + if not per_inv: + return "" + actual = per_inv[0].actual_invocation + if not actual or not actual.final_response or not actual.final_response.parts: + return "" + return "\n".join((p.text or "") for p in actual.final_response.parts).strip() + + +def _invocation_text(invocation: Optional[Invocation], *, user: bool) -> str: + """Concatenate a single invocation's user_content or final_response text.""" + if invocation is None: + return "" + content = invocation.user_content if user else invocation.final_response + if content is None or not content.parts: + return "" + return "\n".join((p.text or "") for p in content.parts).strip() + + +def _render_metric_lines(metrics: Sequence[EvalMetricResult]) -> list[str]: + """Render one block of per-metric verdict lines for a turn or aggregate. + + Drives both per-invocation blocks (``### Turn N``) inside + :func:`_build_turn_block` and the case-level aggregate block + (``### Overall``) inside :func:`_build_overall_block`. Each metric + occupies one ``[PASS|FAIL] name: score=..., threshold=...`` line; + optional ``reason`` and rubric sub-score lines are nested below it. + """ + lines: list[str] = [] + for metric in metrics: + status = _format_status(metric.eval_status) + score_str = f"{metric.score:.4f}" if metric.score is not None else "n/a" + lines.append(f"[{status}] {metric.metric_name}: " + f"score={score_str}, threshold={metric.threshold:.4f}") + + # ``details.reason`` is only populated by LLM-judged evaluators. + # For deterministic matchers, synthesize a one-line explanation + # from the criterion config so the reflection LM sees WHY the + # check failed. 
+ explicit_reason = (metric.details.reason if (metric.details and metric.details.reason) else None) + if explicit_reason: + lines.append(f" reason: {explicit_reason}") + else: + synthesized = _synthesize_failure_reason(metric) + if synthesized: + lines.append(f" reason: {synthesized}") + + # Expand rubric sub-scores so the reflection LM can target the + # precise failing aspect instead of guessing. + rubric_scores = (getattr(metric.details, "rubric_scores", None) if metric.details else None) + if rubric_scores: + for rs in rubric_scores: + rid = (getattr(rs, "id", None) if not isinstance(rs, dict) else rs.get("id")) or "?" + rscore = (getattr(rs, "score", None) if not isinstance(rs, dict) else rs.get("score")) + rreason = (getattr(rs, "reason", "") if not isinstance(rs, dict) else rs.get("reason", "")) + if rscore is None: + continue + rs_status = "PASS" if float(rscore) >= 1.0 else "FAIL" + line = (f" Ā· rubric[{rid}]: {rs_status} " + f"score={float(rscore):.2f}") + if rreason: + line += f" reason: {rreason}" + lines.append(line) + return lines + + +def _synthesize_failure_reason(metric: EvalMetricResult) -> Optional[str]: + """Synthesize a short failure explanation for deterministic metrics. + + Deterministic evaluators (e.g. ``_final_response_evaluator``) only + emit ``score`` + ``eval_status``; without this, the reflection LM + has to diff the agent's output against the reference itself to + guess why the match failed. Translate the criterion config into one + of: + + - "agent output not byte-equal to expected (case-sensitive)" (exact) + - "expected substring not contained in agent output (case-insensitive)" (contains) + - "agent output did not match expected regex" (regex) + - "JSON structural comparison failed" (json) + - "text-... AND JSON-..." when both checks are configured + + Returns ``None`` for non-deterministic metrics, currently-passing + metrics, and missing/malformed criterion configs. 
+ """ + if metric.metric_name != PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value: + return None + if metric.score is None or float(metric.score) >= 1.0: + return None + criterion = metric.criterion or {} + if not isinstance(criterion, dict): + return None + fr = criterion.get("final_response") or criterion.get("finalResponse") + if not isinstance(fr, dict): + return None + + notes: list[str] = [] + text = fr.get("text") or fr.get("text_strategy") or fr.get("textStrategy") + if isinstance(text, dict) and not text.get("ignore"): + match = str(text.get("match") or text.get("match_strategy") or "exact").strip().lower() + case_ins = bool(text.get("case_insensitive") or text.get("caseInsensitive")) + case_tag = "case-insensitive" if case_ins else "case-sensitive" + if match == "exact": + notes.append(f"agent output not byte-equal to expected ({case_tag})") + elif match == "contains": + notes.append(f"expected substring not contained in agent output ({case_tag})") + elif match == "regex": + notes.append(f"agent output did not match expected regex ({case_tag})") + else: + notes.append(f"text match (mode={match}) failed ({case_tag})") + + json_cfg = fr.get("json") or fr.get("json_strategy") or fr.get("jsonStrategy") + if isinstance(json_cfg, dict) and not json_cfg.get("ignore"): + notes.append("JSON structural comparison failed") + + if not notes: + return None + return " AND ".join(notes) + + +def _format_status(status: Any) -> str: + """Render an EvalStatus as its name (PASSED/FAILED/...) — readable + to the reflection LM than the numeric ``.value``. + """ + name = getattr(status, "name", None) + if isinstance(name, str): + return name + return str(status) + + +def _per_metric_objective_scores(case_runs: Sequence[EvalCaseResult], ) -> dict[str, float]: + """Build the per-objective score map for one case. + + Each metric name maps to the mean of its ``score`` across runs. 
+ GEPA uses this to maintain a per-objective Pareto frontier + independent of the aggregated case score — so a candidate that + dominates on one metric (e.g. rubric quality) survives even when + overall pass rates tie. Metrics with no signal across all runs are + skipped (they would taint the mean). + """ + sums: dict[str, float] = {} + counts: dict[str, int] = {} + for run in case_runs: + for metric in run.overall_eval_metric_results or []: + if metric.score is None: + continue + sums[metric.metric_name] = sums.get(metric.metric_name, 0.0) + float(metric.score) + counts[metric.metric_name] = counts.get(metric.metric_name, 0) + 1 + return {name: sums[name] / counts[name] for name in sums} + + +def _continuous_case_score(case_runs: Sequence[EvalCaseResult]) -> float: + """Compute case_score as the mean of per-metric continuous scores. + + Per run: average all ``EvalMetricResult.score`` values (each in + ``[0, 1]``). Across runs (``num_runs > 1``): average the per-run + scores. Continuous scoring lets gepa distinguish candidates that + share PASS/FAIL labels but differ in metric quality (e.g. one keeps + a rubric at 1.0 while another regresses to 0.33 — both still FAIL + overall but only one is strictly better). + """ + run_scores: list[float] = [] + for run in case_runs: + metrics = run.overall_eval_metric_results or [] + metric_scores = [float(m.score) for m in metrics if m.score is not None] + if metric_scores: + run_scores.append(sum(metric_scores) / len(metric_scores)) + else: + # Fallback to the binary PASS/FAIL signal when no per-metric scores + # are emitted (e.g. error path or evaluator that omits details). + run_scores.append(1.0 if run.final_eval_status == EvalStatus.PASSED else 0.0) + if not run_scores: + return 0.0 + return sum(run_scores) / len(run_scores) + + +def _format_tool_args(args: Any) -> str: + """Render a tool-call ``args`` dict inline as ``k=v, k=v``. 
+ + Inline form keeps each tool call on one line; gepa's prompt_renderer + would otherwise expand each arg into its own ``###### key`` heading + and hit the H6 cap. + """ + if not isinstance(args, dict): + return repr(args) + parts: list[str] = [] + for key, value in args.items(): + if isinstance(value, str): + parts.append(f"{key}={value!r}") + elif isinstance(value, (int, float, bool)) or value is None: + parts.append(f"{key}={value}") + else: + parts.append(f"{key}={value!r}") + return ", ".join(parts) + + +def _format_tool_response(response: Any) -> str: + """Render a tool response inline; collapse single-key dicts to bare value.""" + if isinstance(response, dict): + if len(response) == 1: + value = next(iter(response.values())) + if isinstance(value, str): + return repr(value) + return str(value) + return "{" + _format_tool_args(response) + "}" + if isinstance(response, str): + return repr(response) + return str(response) + + +def _resolve_turn_metrics(run: EvalCaseResult, turn_idx: int, total_turns: int) -> list[EvalMetricResult]: + """Pick the verdict slice for one (run, turn). + + Multi-turn cases use ``eval_metric_result_per_invocation[i]. + eval_metric_results``. Single-turn cases sometimes leave that empty + and only populate ``overall_eval_metric_results`` — fall back so a + Turn 1 block still carries a verdict. + """ + per_inv = run.eval_metric_result_per_invocation or [] + if 0 <= turn_idx - 1 < len(per_inv): + pinv = per_inv[turn_idx - 1] + if pinv.eval_metric_results: + return list(pinv.eval_metric_results) + if total_turns == 1: + return list(run.overall_eval_metric_results or []) + return [] + + +def _build_turn_block( + case: EvalCase, + case_runs: Sequence[EvalCaseResult], + turn_idx: int, + total_turns: int, + is_multi_run: bool, +) -> str: + """Render one ``### Turn N`` block grouping user/expected/agent/tool/verdict. 
+ + Conversational truth (User/Expected) is shared across runs and printed + first; for each run the actual agent_response, function-call trace, and + per-turn verdict follow. Multi-run cases nest each run under + ``#### Run N`` so the LM can attribute output variance to a specific + roll-out. + """ + lines: list[str] = [f"### Turn {turn_idx}"] + + convo = case.conversation or case.actual_conversation or [] + if 0 <= turn_idx - 1 < len(convo): + inv = convo[turn_idx - 1] + user_text = _invocation_text(inv, user=True) + if user_text: + lines.append(f"**User**: {user_text}") + expected_text = _invocation_text(inv, user=False) + if expected_text: + lines.append(f"**Expected**: {expected_text}") + + for ordinal, run in enumerate(case_runs, start=1): + run_id = getattr(run, "run_id", None) or ordinal + per_inv = run.eval_metric_result_per_invocation or [] + actual_inv: Optional[Invocation] = None + if 0 <= turn_idx - 1 < len(per_inv): + actual_inv = per_inv[turn_idx - 1].actual_invocation + + if is_multi_run: + lines.append("") + lines.append(f"#### Run {run_id}") + + if actual_inv is not None: + response_text = _invocation_text(actual_inv, user=False) + if response_text: + lines.append(f"**Agent Response**: {response_text}") + + tool_calls = get_all_tool_calls(actual_inv.intermediate_data) + tool_responses = get_all_tool_responses(actual_inv.intermediate_data) + if tool_calls or tool_responses: + lines.append("**Tool Trace**:") + resp_by_id: dict[str, Any] = {tr.id: tr for tr in tool_responses if tr.id} + consumed_ids: set[str] = set() + for tc in tool_calls: + args_inline = _format_tool_args(tc.args) if tc.args else "" + suffix = "" + if tc.id and tc.id in resp_by_id: + tr = resp_by_id[tc.id] + consumed_ids.add(tc.id) + suffix = f" → {_format_tool_response(tr.response)}" + id_tag = f" [id={tc.id}]" if tc.id else "" + lines.append(f"- {tc.name or ''}({args_inline}){suffix}{id_tag}") + # Surface tool_responses arriving without a paired call so + # the reflection LM 
doesn't miss out-of-band observations. + for tr in tool_responses: + if tr.id and tr.id in consumed_ids: + continue + id_tag = f" [id={tr.id}]" if tr.id else "" + lines.append(f"- (orphan response) {tr.name or ''} → " + f"{_format_tool_response(tr.response)}{id_tag}") + + verdict_metrics = _resolve_turn_metrics(run, turn_idx, total_turns) + if verdict_metrics: + run_tag = f", Run {run_id}" if is_multi_run else "" + lines.append(f"**Verdict** (Turn {turn_idx}{run_tag}):") + for verdict_line in _render_metric_lines(verdict_metrics): + lines.append(f" {verdict_line}") + + return "\n".join(lines) + + +def _build_overall_block(case_runs: Sequence[EvalCaseResult], is_multi_run: bool) -> str: + """Render the case-level aggregate verdict block. + + Single-run: ``### Overall (case-level aggregate)`` from the run's + ``overall_eval_metric_results``. Multi-run: ``### Overall (per-run + aggregate)`` with one sub-block per run, so the LM can spot which + runs failed without averaging through to a single mean. + """ + if is_multi_run: + lines: list[str] = ["### Overall (per-run aggregate)"] + for ordinal, run in enumerate(case_runs, start=1): + run_id = getattr(run, "run_id", None) or ordinal + lines.append(f"**Run {run_id}**:") + for verdict_line in _render_metric_lines(run.overall_eval_metric_results or []): + lines.append(f" {verdict_line}") + return "\n".join(lines) + + lines = ["### Overall (case-level aggregate)"] + if case_runs: + lines.extend(_render_metric_lines(case_runs[0].overall_eval_metric_results or [])) + return "\n".join(lines) + + +def _build_case_body(case: EvalCase, case_runs: Sequence[EvalCaseResult]) -> str: + """Build the per-turn-sliced markdown body of a failed case. + + Each turn is one ``### Turn N`` block bundling user / expected / + agent_response / Tool Trace / Verdict so each failing metric is + visually anchored to the turn that produced it. Multi-run cases nest + each run under ``#### Run N``. 
Multi-turn or multi-run cases close + with an ``### Overall`` aggregate. + + Returns an empty string when no usable turn data is available, so + the caller can decide whether to drop the record. + """ + if not case_runs: + return "" + + n_runs = len(case_runs) + is_multi_run = n_runs > 1 + + convo = case.conversation or case.actual_conversation or [] + if convo: + n_turns = len(convo) + else: + n_turns = max( + (len(run.eval_metric_result_per_invocation or []) for run in case_runs), + default=0, + ) + + if n_turns == 0: + return "" + + blocks: list[str] = [] + for turn_idx in range(1, n_turns + 1): + blocks.append(_build_turn_block(case, case_runs, turn_idx, n_turns, is_multi_run)) + + # Single-turn single-run cases skip the Overall block — Turn 1 + # already carries the only verdict that exists. + if n_turns > 1 or is_multi_run: + blocks.append(_build_overall_block(case_runs, is_multi_run)) + + return "\n\n".join(blocks) + + +def _build_other_active_components(candidate: dict[str, str], components_to_update: Sequence[str]) -> str: + """Render the prompt content of every candidate component NOT being + refined this round. + + GEPA fills ```` with only the prompt being rewritten, + but the evaluator's verdict was produced by the agent running with + ALL prompts. Surfacing the others as static context stops the LM + from regressing requirements already enforced elsewhere or + duplicating instructions. + + Returns an empty string when there is only one component or when + the others contain no text. 
+ """ + targets = set(components_to_update) + others = {name: text for name, text in candidate.items() if name not in targets and text} + if not others: + return "" + lines: list[str] = [] + for name in sorted(others): + lines.append(f"### {name} (current)") + lines.append(others[name].rstrip()) + lines.append("") + return "\n".join(lines).rstrip() + + +def _build_trajectory_entry( + case: EvalCase, + score: float, + *, + case_runs: Sequence[EvalCaseResult] = (), + error_message: Optional[str] = None, +) -> dict[str, Any]: + """Bundle one case's evaluation artifacts for reflective dataset construction. + + ``score`` lets ``make_reflective_dataset`` filter to failed cases + without re-reading the runs. ``_case`` + ``_case_runs`` carry + everything the record builder needs to render the turn-sliced body. + On evaluator-error paths (no runs produced), ``error_message`` + surfaces a diagnostic in place of a Case Body. + """ + return { + "score": score, + "_case": case, + "_case_runs": list(case_runs), + "error_message": error_message, + } + + +def _make_return_type_checked_call_agent(call_agent: Any) -> Any: + """Wrap ``call_agent`` with a one-shot return-type check. + + Plain ``async def f(query): return 42`` passes + :func:`inspect.iscoroutinefunction`, so the broken return type is only + discovered when a metric tries to call ``.lower()`` / ``.strip()`` on the + int and produces an opaque ``AttributeError`` deep inside the metric path. + + The wrapper validates ``isinstance(result, str)`` on the first call only, + raising a clear ``TypeError`` that names the actual returned type. After + the first successful call subsequent invocations bypass the check, so the + overhead is a single boolean check on the first case and zero thereafter. 
+ """ + checked = {"done": False} + + async def _checked(query: str) -> str: + result = await call_agent(query) + if not checked["done"]: + if not isinstance(result, str): + raise TypeError(f"call_agent must return str; got " + f"{type(result).__name__} (value={result!r}). " + f"This is checked once on the first invocation.") + checked["done"] = True + return result + + return _checked + + +class _AgentGEPAAdapter: + """GEPA protocol adapter bridging gepa.optimize() to the framework evaluator. + + Per ``evaluate`` call: + 1. Apply the proposed ``candidate`` to all registered ``TargetPrompt`` fields. + 2. Serialize ``batch`` and ``eval_config`` to a temp directory. + 3. Run ``run_evaluator`` (asyncio.run) and collect per-case pass + status + final response. + 4. Build an ``EvaluationBatch`` carrying scores, outputs, and + (optionally) trajectories used by reflective dataset construction. + + ``make_reflective_dataset`` then renders failed trajectories as + ``{component: [{case_id, score, "Case Body", "Other Active Components"?}, + ...]}`` for gepa's reflection prompt template. + """ + + # gepa's reflective proposer reads ``adapter.propose_new_texts`` + # directly; ``None`` signals "use gepa's default reflection LM path". + propose_new_texts = None + + def __init__( + self, + *, + target_prompt: TargetPrompt, + eval_config: EvalConfig, + call_agent: CallAgent, + callbacks: Optional[Callbacks] = None, + num_runs: int = 1, + case_parallelism: Optional[int] = None, + top_k_per_case: int = 2, + ) -> None: + self.target_prompt = target_prompt + self.eval_config = eval_config + # Wrap call_agent so the first call validates the return type and + # surfaces a clear TypeError on misuse (e.g. ``async def f(): return 42`` + # passes static signature checks but only blows up inside metrics). + # The check fires once; later calls bypass the wrapper. 
+ self.call_agent = _make_return_type_checked_call_agent(call_agent) + self.callbacks = callbacks + self.num_runs = num_runs + self.case_parallelism = case_parallelism + self._top_k = max(0, int(top_k_per_case)) + self._best_history: dict[str, list[dict[str, Any]]] = {} + from ._optimize_evaluator_call import EvaluationOutcome # local to avoid cycle + self.last_outcome: Optional[EvaluationOutcome] = None + # Long-lived event loop reused across every evaluate() call so + # async resources held inside call_agent (httpx.AsyncClient, + # asyncpg pools, grpc.aio channels, ...) stay bound to a single + # loop. Created lazily on first evaluate() because adapter is + # constructed from an async context; allocating the loop here + # would not bind to the worker thread that gepa.optimize runs in. + self._loop: Optional[asyncio.AbstractEventLoop] = None + + def _get_or_create_loop(self) -> asyncio.AbstractEventLoop: + """Return the adapter-owned loop, creating it on first call. + + Must be invoked from the worker thread that drives gepa.optimize + (no running loop in that thread, so a fresh loop is safe). + """ + if self._loop is None or self._loop.is_closed(): + self._loop = asyncio.new_event_loop() + return self._loop + + def close(self) -> None: + """Close the adapter-owned loop. 
Idempotent; safe before first evaluate().""" + loop = getattr(self, "_loop", None) + self._loop = None + if loop is None or loop.is_closed(): + return + try: + loop.close() + except Exception: # pragma: no cover - defensive guard + pass + + def _record_history( + self, + *, + case_id: str, + score: float, + best_response: str, + ) -> None: + """Append one historical entry per case, keep at most top_k by score.""" + if self._top_k <= 0: + return + bucket = self._best_history.setdefault(case_id, []) + bucket.append({"score": float(score), "best_response": best_response}) + bucket.sort(key=lambda entry: entry["score"], reverse=True) + del bucket[self._top_k:] + + def evaluate( + self, + batch: list[EvalCase], + candidate: dict[str, str], + capture_traces: bool = False, + ) -> Any: + """Apply ``candidate`` and run the evaluator over ``batch``. + + Both the prompt write and the evaluator run execute on the + adapter-owned event loop so async resources held by call_agent + stay bound to a single loop across every gepa iteration. + """ + from gepa.core.adapter import EvaluationBatch + + loop = self._get_or_create_loop() + loop.run_until_complete(self.target_prompt.write_all(candidate)) + + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + evalset_path = tmp_path / "batch.evalset.json" + metrics_path = tmp_path / "batch.metrics.json" + + # Unique id per call so the in-memory eval-set manager doesn't + # reject repeated batches. gepa's batch sampler pads minibatches + # with least-frequent ids when trainset_size doesn't divide + # minibatch_size, so the same eval_case can appear twice — rename + # duplicate eval_ids in place so the manager accepts the EvalSet + # and every minibatch position still gets scored. 
+ seen: dict[str, int] = {} + unique_cases: list[EvalCase] = [] + for case in batch: + seen[case.eval_id] = seen.get(case.eval_id, 0) + 1 + if seen[case.eval_id] == 1: + unique_cases.append(case) + else: + cloned = case.model_copy() + cloned.eval_id = f"{case.eval_id}__rep{seen[case.eval_id]}" + unique_cases.append(cloned) + evalset = EvalSet( + eval_set_id=f"optimize_gepa_batch_{uuid.uuid4().hex[:8]}", + eval_cases=unique_cases, + ) + evalset_path.write_text(evalset.model_dump_json(indent=2), encoding="utf-8") + metrics_path.write_text(self.eval_config.model_dump_json(indent=2), encoding="utf-8") + + outcome = loop.run_until_complete( + run_evaluator( + eval_dataset_path=str(evalset_path), + eval_metrics_path=str(metrics_path), + call_agent=self.call_agent, + callbacks=self.callbacks, + num_runs=self.num_runs, + case_parallelism=self.case_parallelism, + )) + self.last_outcome = outcome + + return self._build_evaluation_batch( + batch=unique_cases, + result=outcome.raw_result, + capture_traces=capture_traces, + evaluation_batch_cls=EvaluationBatch, + ) + + def make_reflective_dataset( + self, + candidate: dict[str, str], + eval_batch: Any, + components_to_update: list[str], + ) -> Mapping[str, Sequence[Mapping[str, Any]]]: + """Render failed-case trajectories into GEPA's reflective dataset shape. + + Each record is a turn-sliced dict tuned for multi-component / + multi-turn / multi-run / tool-using / multi-metric scenarios: + + - ``case_id``: stable identifier for cross-referencing. + - ``score``: aggregated case score in ``[0, 1]``; ``1.0`` = + every metric passed. + - ``Case Body``: turn-sliced markdown — see :func:`_build_case_body`. + - ``Other Active Components`` *(optional)*: current text of + every other prompt in the candidate. Present only when the + candidate has more than one component and the others + contain text. See :func:`_build_other_active_components`. 
+ + Cases on the evaluator-error path (no runs produced) surface a + minimal record whose Case Body is the captured ``error_message``, + so the reflection LM still sees that the case failed. + """ + if not components_to_update: + return {} + + trajectories = getattr(eval_batch, "trajectories", None) + if not trajectories: + return {comp: [] for comp in components_to_update} + + # Per-component records: ``Other Active Components`` depends on + # which component is being rewritten this round, so rebuild it. + dataset: dict[str, list[Mapping[str, Any]]] = {} + for comp in components_to_update: + other_components_md = _build_other_active_components(candidate, [comp]) + records: list[Mapping[str, Any]] = [] + for traj in trajectories: + score = traj.get("score", 0.0) + if score >= 1.0: + continue + + case = traj.get("_case") + case_runs = traj.get("_case_runs") or [] + if not isinstance(case, EvalCase): + continue + + case_body = (_build_case_body(case, case_runs) if case_runs else "") + if not case_body: + # Evaluator-error path: fall back to the captured + # error_message so the LM still gets a diagnostic. + case_body = (traj.get("error_message") or "(no trajectory data captured)") + record: dict[str, Any] = { + "case_id": case.eval_id, + "score": float(score), + "Case Body": case_body, + } + history = self._best_history.get(case.eval_id, [])[:self._top_k] + if history: + record["history_top_k"] = history + if other_components_md: + record["Other Active Components"] = other_components_md + records.append(record) + dataset[comp] = records + return dataset + + def _build_evaluation_batch( + self, + *, + batch: list[EvalCase], + result: Optional[EvaluateResult], + capture_traces: bool, + evaluation_batch_cls: type, + ) -> Any: + scores: list[float] = [] + outputs: list[Any] = [] + trajectories: Optional[list[dict[str, Any]]] = [] if capture_traces else None + # Per-case per-metric scores. 
Dropped to ``None`` after the loop + # if no metric data was collected, so gepa's per-objective + # frontier stays inactive when the evaluator emits none. + objective_scores: list[dict[str, float]] = [] + + if result is None or not result.results_by_eval_set_id: + for case in batch: + scores.append(0.0) + outputs.append("") + objective_scores.append({}) + if trajectories is not None: + trajectories.append(_build_trajectory_entry(case, 0.0, error_message="no result returned")) + return evaluation_batch_cls( + outputs=outputs, + scores=scores, + trajectories=trajectories, + objective_scores=None, + ) + + set_result = next(iter(result.results_by_eval_set_id.values())) + + for case in batch: + case_runs = set_result.eval_results_by_eval_id.get(case.eval_id, []) + if not case_runs: + scores.append(0.0) + outputs.append("") + objective_scores.append({}) + if trajectories is not None: + trajectories.append( + _build_trajectory_entry( + case, + 0.0, + error_message="case missing from evaluator result", + )) + continue + + case_score = _continuous_case_score(case_runs) + scores.append(case_score) + objective_scores.append(_per_metric_objective_scores(case_runs)) + + first_run = case_runs[0] + outputs.append(_extract_case_output(first_run)) + + self._record_history( + case_id=case.eval_id, + score=case_score, + best_response=_extract_case_output(first_run), + ) + + if trajectories is not None: + trajectories.append(_build_trajectory_entry(case, case_score, case_runs=case_runs)) + + # Keep the field active when ANY case produced a non-empty metric map; + # GEPA treats ``None`` as "no per-objective data". 
+ has_objective_data = any(scores_map for scores_map in objective_scores) + return evaluation_batch_cls( + outputs=outputs, + scores=scores, + trajectories=trajectories, + objective_scores=objective_scores if has_objective_data else None, + ) diff --git a/trpc_agent_sdk/evaluation/_optimize_gepa_callback.py b/trpc_agent_sdk/evaluation/_optimize_gepa_callback.py new file mode 100644 index 0000000..a7588ef --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_gepa_callback.py @@ -0,0 +1,381 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""GEPACallback adapter buffering real-time iteration events as RoundRecords. + +Implements ``gepa.core.callbacks.GEPACallback`` so the framework captures the +full reflective lifecycle for each iteration: + + * ``on_iteration_start`` — reset per-iteration buffer; snapshot the + reflection-LM counters so per-round deltas + are correct. + * ``on_minibatch_sampled`` — record train minibatch size for the round. + * ``on_proposal_end`` — capture which components the reflection LM + actually rewrote this round (gepa's + component selector, e.g. RoundRobin, may + mutate only a subset of the candidate's + components per round). + * ``on_evaluation_end`` — capture parent / candidate subsample scores + (the first two non-seed evaluations of an + iteration are parent + candidate on the + sampled minibatch). + * ``on_evaluation_skipped`` — capture the skip reason that prevented a + full validation evaluation (e.g. subsample + gate did not pass). + * ``on_valset_evaluated`` — capture the full validation pass rate, + metric breakdown and failed case ids; the + ``iteration == 0`` event is recorded as the + baseline instead of a round. + * ``on_merge_attempted`` — tag the current round as a ``"merge"`` round. 
+ * ``on_budget_updated`` — track the gepa-reported ``metric_calls_used`` + counter so the reporter shows real budget + usage instead of a derived estimate. + * ``on_iteration_end`` — flush a complete RoundRecord (always, even + for rounds rejected at the subsample gate); + emit a RoundView for the attached reporter. +""" + +from __future__ import annotations + +from datetime import datetime +from datetime import timezone +from typing import TYPE_CHECKING +from typing import Any +from typing import Callable +from typing import Mapping +from typing import Optional + +from ._optimize_result import RoundRecord + +if TYPE_CHECKING: + from ._optimize_reporter import OptimizeReporter + +# Translate gepa's skip reason literals into user-facing wording. +# Source: reference/gepa reflective_mutation.py:299, :320. +_GEPA_SKIP_REASON_MAP: dict[str, str] = { + "no_trajectories": "no trajectories captured this round", + "all_scores_perfect": "minibatch already perfect (skip_perfect_score on)", +} + +# Used when a round produced no candidate without emitting evaluation_skipped. +_NO_PROPOSAL_FALLBACK: str = "reflect-LM produced no usable new prompt" + + +def _translate_skip_reason(raw: Optional[str]) -> Optional[str]: + """Translate a gepa skip reason; unknown values surface under ``gepa-internal:``.""" + if raw is None: + return None + text = str(raw).strip() + if not text: + return None + if text in _GEPA_SKIP_REASON_MAP: + return _GEPA_SKIP_REASON_MAP[text] + normalised = text.lower().replace(" ", "_").replace("-", "_") + if normalised in _GEPA_SKIP_REASON_MAP: + return _GEPA_SKIP_REASON_MAP[normalised] + return f"gepa-internal: {text}" + + +class _AgentGEPACallback: + """Buffer per-iteration RoundRecords for GepaReflectiveOptimizer. + + Attributes: + rounds: list of RoundRecord populated during gepa.optimize() execution. + baseline_metric_breakdown: metric breakdown for the seed candidate + captured from the iteration-0 valset evaluation event. 
+ baseline_failed_case_ids: failed case ids for the seed candidate. + baseline_pass_rate: average validation score for the seed candidate. + """ + + def __init__( + self, + *, + adapter: Any = None, + reflection_lm: Any = None, + reporter: Optional["OptimizeReporter"] = None, + train_size: int = 0, + budget_total: Optional[int] = None, + metric_thresholds: Optional[Mapping[str, float]] = None, + on_valset_breakdown: Optional[Callable[[dict[str, float]], None]] = None, + ) -> None: + self.rounds: list[RoundRecord] = [] + self.baseline_metric_breakdown: dict[str, float] = {} + self.baseline_failed_case_ids: list[str] = [] + self.baseline_pass_rate: float = 0.0 + self._adapter = adapter + self._reflection_lm = reflection_lm + self._reporter = reporter + self._train_size = int(train_size) + self._budget_total = budget_total + self._metric_thresholds = dict(metric_thresholds or {}) + self._on_valset_breakdown = on_valset_breakdown + self._budget_used: int = 0 + self._reset_iter_buffer() + self._calls_at_iter_start: int = 0 + self._cost_at_iter_start: float = 0.0 + self._tokens_at_iter_start: dict[str, int] = { + "prompt": 0, + "completion": 0, + "total": 0, + } + + def _reset_iter_buffer(self) -> None: + self._iter_started_at: Optional[datetime] = None + self._iter_iteration: int = 0 + self._iter_candidate: Optional[dict[str, str]] = None + self._iter_val_score: Optional[float] = None + self._iter_is_best: bool = False + self._iter_metric_breakdown: dict[str, float] = {} + self._iter_failed_case_ids: list[str] = [] + self._iter_train_minibatch_size: int = 0 + self._iter_train_size: int = self._train_size + self._iter_train_parent_score: Optional[float] = None + self._iter_train_candidate_score: Optional[float] = None + self._iter_skip_reason: Optional[str] = None + self._iter_error_message: Optional[str] = None + self._iter_kind: str = "reflective" + # Components rewritten this round (set by on_proposal_end). None + # means no proposal event observed for the iteration. 
+ self._iter_changed_components: Optional[list[str]] = None + + def on_iteration_start(self, event: Mapping[str, Any]) -> None: + self._reset_iter_buffer() + self._iter_started_at = datetime.now(timezone.utc) + self._iter_iteration = int(event.get("iteration", 0)) + if self._reflection_lm is not None: + self._calls_at_iter_start = int(getattr(self._reflection_lm, "total_calls", 0)) + self._cost_at_iter_start = float(getattr(self._reflection_lm, "total_cost", 0.0)) + usage = getattr(self._reflection_lm, "total_token_usage", None) or {} + self._tokens_at_iter_start = { + "prompt": int(usage.get("prompt", 0)), + "completion": int(usage.get("completion", 0)), + "total": int(usage.get("total", 0)), + } + + def on_minibatch_sampled(self, event: Mapping[str, Any]) -> None: + minibatch_ids = event.get("minibatch_ids") or [] + self._iter_train_minibatch_size = len(minibatch_ids) + trainset_size = event.get("trainset_size") + if isinstance(trainset_size, int) and trainset_size > 0: + self._iter_train_size = trainset_size + + def on_proposal_end(self, event: Mapping[str, Any]) -> None: + """Capture which components the reflection LM rewrote this round. + + gepa's component selector (e.g. ``RoundRobinReflectionComponentSelector``) + chooses a subset of the candidate's components per round; only + components that produced a non-empty new instruction land in + ``new_instructions``, making it the authoritative source for the + ``optimized_field_names`` field on the buffered RoundRecord. Code + paths that bypass this event (e.g. merge rounds) leave the + marker ``None`` so ``on_iteration_end`` falls back to + ``candidate.keys()``. + """ + new_instructions = event.get("new_instructions") + if isinstance(new_instructions, Mapping): + self._iter_changed_components = list(new_instructions.keys()) + + def on_evaluation_end(self, event: Mapping[str, Any]) -> None: + """Record subsample scores for the parent and the new candidate. 
+ + gepa marks the post-mutation / post-merge evaluation with + ``candidate_idx=None`` (reflective_mutation.py:430 emits None for + the new-candidate eval; merge.py:376 also uses None for the + post-merge eval). Every other evaluation_end carries an int + ``candidate_idx`` and represents the parent / current-program + eval. Routing on this field is more reliable than counting + event order — earlier seq-based logic misclassified rounds + where the reflective proposer picked the seed program (id=0) + as parent, because gepa flags that parent eval with + ``is_seed_candidate=True`` and the previous early-return + dropped the parent score, shifting the candidate score into + the parent slot. + """ + scores = event.get("scores") or [] + if not scores: + return + avg = sum(float(s) for s in scores) / max(1, len(scores)) + if event.get("candidate_idx") is None: + # New candidate evaluation (reflective post-mutation OR + # merge post-merge). + self._iter_train_candidate_score = avg + else: + # Parent / current-program evaluation. 
+ self._iter_train_parent_score = avg + if not self._iter_train_minibatch_size: + self._iter_train_minibatch_size = len(scores) + + def on_evaluation_skipped(self, event: Mapping[str, Any]) -> None: + translated = _translate_skip_reason(event.get("reason")) + if translated: + self._iter_skip_reason = translated + + def on_merge_attempted(self, event: Mapping[str, Any]) -> None: + self._iter_kind = "merge" + + def on_budget_updated(self, event: Mapping[str, Any]) -> None: + used = event.get("metric_calls_used") + if isinstance(used, int): + self._budget_used = used + + def on_error(self, event: Mapping[str, Any]) -> None: + exc = event.get("exception") + if exc is not None: + self._iter_error_message = str(exc) + + def on_valset_evaluated(self, event: Mapping[str, Any]) -> None: + candidate = event.get("candidate") + if candidate is None: + return + # adapter.last_outcome was set immediately before gepa emits this + # event, so the breakdown / failures correspond to ``candidate``. + outcome = getattr(self._adapter, "last_outcome", None) if self._adapter else None + metric_breakdown: dict[str, float] = {} + failed_case_ids: list[str] = [] + if outcome is not None: + metric_breakdown = dict(getattr(outcome, "metric_breakdown", {})) + failed_case_ids = list(getattr(outcome, "failed_case_ids", [])) + + if self._on_valset_breakdown is not None: + try: + self._on_valset_breakdown(dict(metric_breakdown)) + except Exception: # pragma: no cover - never break loop on stopper error + pass + + if int(event.get("iteration", -1)) == 0: + self.baseline_metric_breakdown = metric_breakdown + self.baseline_failed_case_ids = failed_case_ids + self.baseline_pass_rate = float(event.get("average_score", 0.0)) + if self._reporter is not None: + try: + self._reporter.baseline_evaluated( + self.baseline_pass_rate, + dict(self.baseline_metric_breakdown), + metric_thresholds=dict(self._metric_thresholds), + ) + except Exception: # pragma: no cover - never break loop on reporter error + pass + 
return + + self._iter_candidate = dict(candidate) + self._iter_val_score = float(event.get("average_score", 0.0)) + self._iter_is_best = bool(event.get("is_best_program", False)) + self._iter_metric_breakdown = metric_breakdown + self._iter_failed_case_ids = failed_case_ids + + def on_iteration_end(self, event: Mapping[str, Any]) -> None: + """Flush a RoundRecord for the iteration regardless of acceptance. + + Iterations rejected at the subsample gate (``_iter_candidate`` stays + None) are still recorded so the reporter timeline matches gepa's + actual progression and round indices stay contiguous. + """ + iteration = int(event.get("iteration", self._iter_iteration)) + started_at = self._iter_started_at or datetime.now(timezone.utc) + finished_at = datetime.now(timezone.utc) + duration = max(0.0, (finished_at - started_at).total_seconds()) + proposal_accepted = bool(event.get("proposal_accepted", False)) + candidate_seen = self._iter_candidate is not None + accepted = proposal_accepted and candidate_seen + + if self._iter_error_message: + reason = f"error: {self._iter_error_message}" + elif self._iter_skip_reason: + reason = f"skipped: {self._iter_skip_reason}" + elif candidate_seen: + score = self._iter_val_score or 0.0 + reason = (f"GEPA accepted proposal (val_score={score:.4f})" + if accepted else f"Explored by GEPA (val_score={score:.4f})") + else: + reason = "no candidate produced this round" + + reflection_calls_delta = 0 + round_llm_cost = 0.0 + round_token_usage = {"prompt": 0, "completion": 0, "total": 0} + if self._reflection_lm is not None: + reflection_calls_delta = max( + 0, + int(getattr(self._reflection_lm, "total_calls", 0)) - self._calls_at_iter_start, + ) + round_llm_cost = max( + 0.0, + float(getattr(self._reflection_lm, "total_cost", 0.0)) - self._cost_at_iter_start, + ) + cur = getattr(self._reflection_lm, "total_token_usage", None) or {} + for key in ("prompt", "completion", "total"): + round_token_usage[key] = max( + 0, + int(cur.get(key, 0)) 
- self._tokens_at_iter_start.get(key, 0), + ) + + validation_pass_rate = (self._iter_val_score if self._iter_val_score is not None else 0.0) + candidate_prompts = (dict(self._iter_candidate) if candidate_seen else {}) + # Authoritative source: components captured from on_proposal_end. + # Fallback to full candidate keys for rounds without a proposal + # event (e.g. merge rounds — "rewrite" doesn't apply, listing all + # keys is the least misleading default). + if self._iter_changed_components is not None: + optimized_field_names = list(self._iter_changed_components) + elif candidate_seen: + optimized_field_names = list(self._iter_candidate.keys()) + else: + optimized_field_names = [] + + skip_reason = self._iter_skip_reason + if (not candidate_seen and skip_reason is None and self._iter_error_message is None): + skip_reason = _NO_PROPOSAL_FALLBACK + + record = RoundRecord( + round=iteration, + optimized_field_names=optimized_field_names, + candidate_prompts=candidate_prompts, + train_pass_rate=0.0, + validation_pass_rate=validation_pass_rate, + metric_breakdown=dict(self._iter_metric_breakdown), + accepted=accepted, + acceptance_reason=reason, + failed_case_ids=list(self._iter_failed_case_ids), + reflection_lm_calls=reflection_calls_delta, + round_llm_cost=round_llm_cost, + round_token_usage=round_token_usage, + started_at=started_at.isoformat(), + duration_seconds=duration, + kind=self._iter_kind if self._iter_kind in ("reflective", "merge") else "reflective", + train_minibatch_size=self._iter_train_minibatch_size, + train_subsample_parent_score=self._iter_train_parent_score, + train_subsample_candidate_score=self._iter_train_candidate_score, + skip_reason=skip_reason, + error_message=self._iter_error_message, + budget_used=self._budget_used if self._budget_used else None, + budget_total=self._budget_total, + ) + self.rounds.append(record) + + if self._reporter is not None: + try: + self._emit_round_completed(record) + except Exception: # pragma: no cover - never 
break loop on reporter error + pass + + def _emit_round_completed(self, record: RoundRecord) -> None: + """Translate a freshly buffered RoundRecord into a RoundView event.""" + from ._optimize_reporter import RoundView + + view = RoundView( + round=record.round, + kind=record.kind, + train_minibatch_size=record.train_minibatch_size, + train_size=self._iter_train_size or self._train_size, + train_subsample_parent_score=record.train_subsample_parent_score, + train_subsample_candidate_score=record.train_subsample_candidate_score, + val_pass_rate=(record.validation_pass_rate if record.candidate_prompts else None), + accepted=record.accepted, + skip_reason=record.skip_reason, + error_message=record.error_message, + duration_seconds=record.duration_seconds, + budget_used=record.budget_used, + budget_total=record.budget_total, + ) + self._reporter.round_completed(view) diff --git a/trpc_agent_sdk/evaluation/_optimize_gepa_reflective.py b/trpc_agent_sdk/evaluation/_optimize_gepa_reflective.py new file mode 100644 index 0000000..322340e --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_gepa_reflective.py @@ -0,0 +1,612 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""GEPA reflective optimizer: BaseOptimizer subclass driving ``gepa.optimize()``. + +Hosts the ``gepa_reflective`` algorithm and its registry entry. The GEPA +protocol adapter and trajectory helpers live in +:mod:`_optimize_gepa_adapter`; the reflection-LM wrapper lives in +:mod:`_optimize_model_callable`. + +``gepa`` is an optional dependency: ``gepa.optimize`` and the stopper +classes are imported lazily inside :meth:`GepaReflectiveOptimizer._call_gepa_optimize` +and :meth:`GepaReflectiveOptimizer._build_stop_callbacks`, so importing +this module without ``gepa`` installed succeeds but ``run()`` then fails +fast with an informative ImportError. 
def _collect_metric_thresholds(eval_config: EvalConfig) -> dict[str, float]:
    """Map every configured metric name to its threshold as a float.

    Args:
        eval_config: Evaluator config; its ``get_eval_metrics()`` result is
            iterated and each item's ``metric_name`` / ``threshold`` read.

    Returns:
        ``{metric_name: float(threshold)}``; a repeated name keeps the
        threshold of its last occurrence.
    """
    thresholds: dict[str, float] = {}
    for metric in eval_config.get_eval_metrics():
        thresholds[metric.metric_name] = float(metric.threshold)
    return thresholds
class _RequiredMetricsAboveThresholdStopper:
    """gepa stopper that fires once the latest breakdown meets every threshold.

    The most recent per-metric breakdown is pushed in through ``update``;
    ``__call__`` hands that breakdown and the required thresholds to
    ``BaseOptimizer.metrics_meet_thresholds`` and returns its verdict.

    Attributes:
        last_triggered: Sticky flag, set the first time ``__call__``
            returns a truthy verdict and never cleared afterwards.
    """

    def __init__(self, required_thresholds: dict[str, float]) -> None:
        # Private copies: later caller-side mutation must not leak in.
        self._thresholds: dict[str, float] = {**required_thresholds}
        self._latest: dict[str, float] = {}
        self.last_triggered: bool = False

    def update(self, breakdown: dict[str, float]) -> None:
        """Replace the stored breakdown with a copy of ``breakdown``."""
        self._latest = dict(breakdown)

    def __call__(self, gepa_state: Any = None) -> bool:
        """Return the threshold verdict for the latest breakdown.

        ``gepa_state`` is accepted for stopper-call compatibility and ignored.
        """
        verdict = BaseOptimizer.metrics_meet_thresholds(self._latest, self._thresholds)
        if not verdict:
            return verdict
        self.last_triggered = True
        return verdict
+ total_judge_cost: USD cost charged to the judge LM (added to + reflection-LM cost). + total_token_usage: ``{"prompt", "completion", "total"}`` for the + reflection LM, optionally merged with judge token usage. + """ + val_scores = list(gepa_result.val_aggregate_scores) + baseline_pass_rate = float(val_scores[0]) if val_scores else 0.0 + best_idx = int(gepa_result.best_idx) + best_pass_rate = float(val_scores[best_idx]) if val_scores else 0.0 + + started_iso = started_at.isoformat() + if callback_rounds: + rounds = list(callback_rounds) + else: + # Fallback path: no callback event stream available. gepa_result + # alone doesn't carry per-round mutation metadata, so fields + # below use the most-conservative approximation: + # * optimized_field_names: all candidate keys (no signal for + # which subset the reflection LM actually rewrote — the + # callback path narrows this via on_proposal_end). + # * accepted: equated with is_best, since GEPAResult only + # reports the final winner, not per-round acceptance. 
+ candidates = list(gepa_result.candidates) + rounds = [] + for i in range(1, len(candidates)): + candidate = dict(candidates[i]) + score = float(val_scores[i]) if i < len(val_scores) else 0.0 + is_best = i == best_idx + rounds.append( + RoundRecord( + round=i, + optimized_field_names=list(candidate.keys()), + candidate_prompts=candidate, + train_pass_rate=0.0, + validation_pass_rate=score, + accepted=is_best, + acceptance_reason=(f"Selected as best by GEPA (val_score={score:.4f})" + if is_best else f"Explored by GEPA (val_score={score:.4f})"), + started_at=started_iso, + duration_seconds=0.0, + )) + + best_metric_breakdown: dict[str, float] = {} + for record in rounds: + if record.candidate_prompts == best_candidate and record.metric_breakdown: + best_metric_breakdown = dict(record.metric_breakdown) + break + + # When gepa finds no improvement (best_idx == 0), best_candidate equals + # the seed prompts and the loop above never matches — iteration 0 is + # captured as ``baseline_metric_breakdown`` rather than a RoundRecord. + # Mirror baseline data into ``best`` so summary.txt shows + # ``baseline -> baseline`` (no improvement) instead of + # ``baseline -> nan`` (looks like data loss). + if (not best_metric_breakdown and best_candidate == baseline_prompts and baseline_metric_breakdown): + best_metric_breakdown = dict(baseline_metric_breakdown) + + extras: dict[str, Any] = {} + total_metric_calls = getattr(gepa_result, "total_metric_calls", None) + if total_metric_calls is not None: + extras["total_metric_calls"] = int(total_metric_calls) + + duration_seconds = max(0.0, (finished_at - started_at).total_seconds()) + token_usage = dict(total_token_usage) if total_token_usage else { + "prompt": 0, + "completion": 0, + "total": 0, + } + + # GEPA's per_objective_best_candidates is dict[str, set[int]] | None; + # convert to dict[str, list[int]] (sorted) for stable JSON output. 
def _build_failed_result(
    *,
    baseline_prompts: dict[str, str],
    started_at: datetime,
    finished_at: datetime,
    error_message: str,
    algo_name: str,
    metric_thresholds: Optional[dict[str, float]] = None,
) -> OptimizeResult:
    """Build a FAILED OptimizeResult whose best prompts are the baseline.

    Args:
        baseline_prompts: Prompts read before optimization; copied into both
            ``baseline_prompts`` and ``best_prompts``.
        started_at: Run start time.
        finished_at: Failure time; the duration is clamped at zero.
        error_message: Stored verbatim on the result.
        algo_name: Value for the ``algorithm`` field.
        metric_thresholds: Optional thresholds to echo back; None means empty.

    Returns:
        An OptimizeResult with ``status="FAILED"``, ``finish_reason="error"``
        and all score, round, call and cost fields zeroed.
    """
    elapsed = max(0.0, (finished_at - started_at).total_seconds())
    fields: dict[str, Any] = {
        "algorithm": algo_name,
        "status": "FAILED",
        "finish_reason": "error",
        "error_message": error_message,
        # No evaluation completed, so every score is reported as zero.
        "baseline_pass_rate": 0.0,
        "best_pass_rate": 0.0,
        "pass_rate_improvement": 0.0,
        "metric_thresholds": dict(metric_thresholds or {}),
        # Two independent copies so the fields never alias each other.
        "baseline_prompts": dict(baseline_prompts),
        "best_prompts": dict(baseline_prompts),
        "total_rounds": 0,
        "rounds": [],
        "total_reflection_lm_calls": 0,
        "total_judge_model_calls": 0,
        "total_llm_cost": 0.0,
        "duration_seconds": elapsed,
        "started_at": started_at.isoformat(),
        "finished_at": finished_at.isoformat(),
        "extras": {},
    }
    return OptimizeResult(**fields)
def _classify_stop_reason(
    *,
    stop_callbacks: list[Any],
    framework_stopper: Optional[_RequiredMetricsAboveThresholdStopper],
) -> StopReason:
    """Choose the stop reason to report for a finished gepa run.

    Precedence:
        1. ``"required_metrics_passing"`` when ``framework_stopper`` exists
           and its ``last_triggered`` flag is set.
        2. The ``label`` of the first ``_LabeledStopper`` in
           ``stop_callbacks`` (list order) whose ``last_triggered`` is set.
        3. ``"completed"`` when neither applies.

    Args:
        stop_callbacks: Stoppers handed to gepa; entries that are not
            ``_LabeledStopper`` instances are ignored.
        framework_stopper: The required-metrics stopper, or None.
    """
    framework_fired = framework_stopper is not None and framework_stopper.last_triggered
    if framework_fired:
        return "required_metrics_passing"

    fired_labels = (
        candidate.label
        for candidate in stop_callbacks
        if isinstance(candidate, _LabeledStopper) and candidate.last_triggered
    )
    return next(fired_labels, "completed")
+ """ + + async def _call_gepa_optimize(self, **kwargs: Any) -> Any: + """Run gepa.optimize in a thread; isolated for tests to monkeypatch.""" + from gepa import optimize as gepa_optimize # lazy import; gepa is optional + + return await asyncio.to_thread(gepa_optimize, **kwargs) + + async def run( + self, + *, + reporter: Optional[OptimizeReporter] = None, + ) -> OptimizeResult: + algo: GepaReflectiveAlgo = self.config.optimize.algorithm + algo_name = algo.name + metric_thresholds = _collect_metric_thresholds(self.config.evaluate) + + started_at = datetime.now(timezone.utc) + baseline_prompts = await self.target_prompt.read_all() + seed_candidate = dict(baseline_prompts) + + try: + trainset = _load_evalset_cases(self.train_dataset_path) + valset = _load_evalset_cases(self.validation_dataset_path) + except Exception as ex: + return _build_failed_result( + baseline_prompts=baseline_prompts, + started_at=started_at, + finished_at=datetime.now(timezone.utc), + error_message=f"dataset load failed: {ex}", + algo_name=algo_name, + metric_thresholds=metric_thresholds, + ) + + adapter = _AgentGEPAAdapter( + target_prompt=self.target_prompt, + eval_config=self.config.evaluate, + call_agent=self.call_agent, + callbacks=self.callbacks, + num_runs=self.config.evaluate.num_runs, + case_parallelism=self.config.optimize.eval_case_parallelism, + top_k_per_case=int(algo.reflection_history_top_k), + ) + reflection_lm = _OptimizeModelCallable(algo.reflection_lm) + + try: + return await self._run_with_adapter( + adapter=adapter, + reflection_lm=reflection_lm, + algo=algo, + algo_name=algo_name, + baseline_prompts=baseline_prompts, + seed_candidate=seed_candidate, + trainset=trainset, + valset=valset, + metric_thresholds=metric_thresholds, + started_at=started_at, + reporter=reporter, + ) + finally: + adapter.close() + + async def _run_with_adapter( + self, + *, + adapter: _AgentGEPAAdapter, + reflection_lm: _OptimizeModelCallable, + algo: GepaReflectiveAlgo, + algo_name: str, + 
baseline_prompts: dict[str, str], + seed_candidate: dict[str, str], + trainset: list, + valset: list, + metric_thresholds: dict[str, float], + started_at: datetime, + reporter: Optional[OptimizeReporter], + ) -> OptimizeResult: + try: + stop_callbacks, framework_stopper = _build_stop_callbacks( + algo, + self.config.optimize.stop, + metric_thresholds, + output_dir=self.output_dir, + ) + except ImportError as ex: + return _build_failed_result( + baseline_prompts=baseline_prompts, + started_at=started_at, + finished_at=datetime.now(timezone.utc), + error_message=f"gepa stop_callbacks unavailable: {ex}", + algo_name=algo_name, + metric_thresholds=metric_thresholds, + ) + + gepa_callback = _AgentGEPACallback( + adapter=adapter, + reflection_lm=reflection_lm, + reporter=reporter, + train_size=len(trainset), + budget_total=algo.max_metric_calls, + metric_thresholds=metric_thresholds, + on_valset_breakdown=(framework_stopper.update if framework_stopper is not None else None), + ) + + # Embed a metric reference doc in the reflection prompt template so + # the reflection LM understands each feedback row. Empty doc still + # yields a GEPA-valid template. 
+ reflection_prompt_template = build_reflection_prompt_template(build_metric_reference_doc(self.config.evaluate)) + + gepa_kwargs: dict[str, Any] = dict( + seed_candidate=seed_candidate, + trainset=trainset, + valset=valset, + adapter=adapter, + reflection_lm=reflection_lm, + reflection_prompt_template=reflection_prompt_template, + callbacks=[gepa_callback, *self.extra_gepa_callbacks], + candidate_selection_strategy=algo.candidate_selection_strategy, + module_selector=algo.module_selector, + reflection_minibatch_size=algo.reflection_minibatch_size, + skip_perfect_score=algo.skip_perfect_score, + perfect_score=algo.perfect_score, + use_merge=algo.use_merge, + max_merge_invocations=algo.max_merge_invocations, + merge_val_overlap_floor=algo.merge_val_overlap_floor, + frontier_type=algo.frontier_type, + cache_evaluation=algo.cache_evaluation, + track_best_outputs=algo.track_best_outputs, + raise_on_exception=True, + seed=algo.seed, + display_progress_bar=False, + stop_callbacks=[*stop_callbacks, *self.extra_stop_callbacks], + ) + # ``max_metric_calls`` is also a direct kwarg for backwards + # compatibility with gepa builds lacking ``MaxMetricCallsStopper``. + if algo.max_metric_calls is not None: + gepa_kwargs["max_metric_calls"] = int(algo.max_metric_calls) + + # Silence gepa's stdout logger when a reporter is attached so its + # internal messages don't collide with the reporter timeline. 
+ if reporter is not None: + gepa_kwargs["logger"] = _SilentGepaLogger(verbose=1) + try: + gepa_result = await self._call_gepa_optimize(**gepa_kwargs) + except Exception as ex: + return _build_failed_result( + baseline_prompts=baseline_prompts, + started_at=started_at, + finished_at=datetime.now(timezone.utc), + error_message=str(ex), + algo_name=algo_name, + metric_thresholds=metric_thresholds, + ) + + best_idx = int(gepa_result.best_idx) + best_candidate = dict(gepa_result.candidates[best_idx]) + + val_scores = list(gepa_result.val_aggregate_scores) + baseline_pass_rate = float(val_scores[0]) if val_scores else 0.0 + best_pass_rate = float(val_scores[best_idx]) if val_scores else 0.0 + if best_pass_rate >= 1.0 and baseline_pass_rate >= 1.0: + finish_reason = "perfect_pass_rate" + elif best_pass_rate <= baseline_pass_rate: + finish_reason = "no_improvement" + else: + finish_reason = "completed" + + stop_reason: StopReason = _classify_stop_reason( + stop_callbacks=stop_callbacks, + framework_stopper=framework_stopper, + ) + + return _build_optimize_result( + gepa_result=gepa_result, + baseline_prompts=baseline_prompts, + best_candidate=best_candidate, + reflection_lm_cost=reflection_lm.total_cost, + callback_rounds=gepa_callback.rounds, + started_at=started_at, + finished_at=datetime.now(timezone.utc), + algo_name=algo_name, + finish_reason=finish_reason, + baseline_metric_breakdown=dict(gepa_callback.baseline_metric_breakdown), + metric_thresholds=metric_thresholds, + stop_reason=stop_reason, + total_reflection_lm_calls=int(reflection_lm.total_calls), + total_judge_model_calls=0, + total_judge_cost=0.0, + total_token_usage=dict(reflection_lm.total_token_usage), + ) diff --git a/trpc_agent_sdk/evaluation/_optimize_metric_info.py b/trpc_agent_sdk/evaluation/_optimize_metric_info.py new file mode 100644 index 0000000..70c6f2b --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_metric_info.py @@ -0,0 +1,534 @@ +# Tencent is pleased to support the open source 
community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Metric reference doc builder for the optimize module. + +Renders a structured markdown "syllabus" describing how each +user-configured metric is computed, for injection into the reflection +LM's prompt template alongside the per-case feedback. + +The corpus is owned here (not delegated to each evaluator's +``get_metric_info()``) so wording can be tuned for what a rewriting LM +needs. + +Coverage: +- Excludes tool/algorithm-fixed metrics (``tool_trajectory_avg_score``, + ``response_match_score``, ``response_evaluation_score``). +- FinalResponseCriterion: text match modes / case sensitivity / ignore / + JSON tree / numeric tolerance / AND combination / custom compare. +- LLMJudgeCriterion: single/multi judge / six built-in aggregators / + parallel / rubrics / knowledge_tool_names / generation_config / think + mode / weights. +""" + +from __future__ import annotations + +import math +from typing import Any +from typing import Optional + +from ._eval_config import EvalConfig +from ._eval_metrics import EvalMetric +from ._eval_metrics import PrebuiltMetrics +from ._evaluator_registry import EVALUATOR_REGISTRY + +_SKIPPED_METRICS: frozenset[str] = frozenset({ + PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value, + PrebuiltMetrics.RESPONSE_MATCH_SCORE.value, + PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value, +}) + +_METRIC_DESCRIPTIONS: dict[str, str] = { + PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value: ("Deterministic match between the agent's final response and the " + "reference answer. 
Each invocation scores 1.0 (match) or 0.0 (no " + "match); the case score is the mean across invocations."), + PrebuiltMetrics.LLM_FINAL_RESPONSE.value: ("An LLM judge inspects the agent's final response and returns a " + "holistic valid/invalid verdict (1.0 or 0.0) together with a " + "natural-language reason."), + PrebuiltMetrics.LLM_RUBRIC_RESPONSE.value: ("An LLM judge scores the agent's final response against a list of " + "rubric items. Each rubric is judged independently (0 or 1); the " + "overall score is the mean of sub-scores. The judge returns a per-" + "rubric reason explaining its verdict."), + PrebuiltMetrics.LLM_RUBRIC_KNOWLEDGE_RECALL.value: + ("An LLM judge inspects the knowledge content the agent retrieved via " + "tool calls and scores it against a list of rubric items. Each " + "rubric is judged independently (0 or 1); the overall score is the " + "mean of sub-scores."), +} + +_AGGREGATOR_EXPLANATIONS: dict[str, str] = { + "all_pass": "all judges must PASS for the metric to PASS (strictest).", + "any_pass": "any single judge passing is enough for the metric to PASS (most lenient).", + "majority_pass": "more than half of the judges must PASS.", + "avg": "arithmetic mean of judges' scores (uniform weighting).", + "weighted_avg": "weighted mean of judges' scores using each model's ``weight``.", + "weighted_majority": "weighted majority vote: passes when the weighted PASS vote exceeds the FAIL vote.", +} + +_HEADER = ("## Metrics Reference\n\n" + "The assistant's outputs are graded by the metrics below. UNDERSTAND THESE " + "BEFORE PROPOSING CHANGES — they determine whether your new instruction " + "improves or regresses the candidate.") + +_FOOTER_GUIDELINES = ("## Rewriting Guidelines\n\n" + "1. **Preserve passing metrics.** A metric currently above its threshold " + "must not be sacrificed to fix a failing one.\n" + "2. 
def build_metric_section(metric: EvalMetric) -> str:
    """Render the markdown reference section for a single metric.

    The section lists, in order: the metric type, an optional description,
    the scoring algorithm derived from ``metric.criterion``, the PASS
    condition, the fields present in per-case feedback, and rewriting hints
    for the reflection LM.

    Public to keep tests focused: the section is also unit-testable
    independently of the surrounding header/footer.

    Args:
        metric: Metric to document. ``metric_name``, ``threshold`` and
            ``criterion`` are read; a missing criterion is treated as ``{}``.

    Returns:
        The section as a single newline-joined markdown string.
    """
    name = metric.metric_name
    threshold = float(metric.threshold)
    criterion = metric.criterion or {}

    rubric_metrics = {
        PrebuiltMetrics.LLM_RUBRIC_RESPONSE.value,
        PrebuiltMetrics.LLM_RUBRIC_KNOWLEDGE_RECALL.value,
    }
    llm_judge_metrics = rubric_metrics | {PrebuiltMetrics.LLM_FINAL_RESPONSE.value}

    lines: list[str] = [
        f"### Metric: `{name}`",
        "",
        f"**Type**: {_metric_type(name)}",
    ]
    description = _METRIC_DESCRIPTIONS.get(name)
    if description:
        lines.append(f"**Description**: {description}")
    lines.append("")

    # Metrics outside the two known families get an empty algorithm section.
    lines.append("**Scoring algorithm**:")
    if name == PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value:
        lines.extend(_render_final_response_criterion(criterion, metric_name=name))
    elif name in llm_judge_metrics:
        lines.extend(_render_llm_judge_criterion(criterion, metric_name=name))
    lines.append("")

    lines.append("**Score range**: 0.0 ~ 1.0")
    lines.append(f"**PASS condition**: score >= {threshold:.4f}")
    if name in rubric_metrics:
        n_rubrics = _count_rubrics(criterion)
        if n_rubrics > 0:
            # ``threshold * n_rubrics`` can land a hair above the true product
            # (0.07 * 100 == 7.000000000000001), so a bare ``ceil`` would ask
            # for one rubric too many. Shave a small epsilon first, and never
            # report a negative count for thresholds below zero.
            min_pass = max(0, math.ceil(threshold * n_rubrics - 1e-9))
            lines.append(f" - With {n_rubrics} rubric item(s), at least **{min_pass}** must pass.")
    lines.append("")

    lines.append("**Per-case feedback contains**:")
    lines.extend(_render_feedback_fields(name))
    lines.append("")

    lines.append("**What reflection LM should know**:")
    lines.extend(_render_reflection_hints(name, criterion))

    return "\n".join(lines)
"LLM-judged rubric scoring over knowledge-retrieval tool outputs" + return "custom" + + +def _render_final_response_criterion(criterion: dict, *, metric_name: str) -> list[str]: + out: list[str] = [] + + if _has_custom_compare(metric_name): + out.append("- **Custom compare function**: registered via " + "``EVALUATOR_REGISTRY.set_criterion_compare``. This callable " + "**overrides** all built-in text/JSON strategies below — the " + "agent's output is judged purely by user code.") + return out + + fr = _pick(criterion, "final_response", "finalResponse") + if not isinstance(fr, dict) or not fr: + out.append("- _No ``final_response`` config provided; the metric will return 0.0 (FAIL)._") + return out + + text = _pick(fr, "text", "text_strategy", "textStrategy") + json_cfg = _pick(fr, "json", "json_strategy", "jsonStrategy") + + if isinstance(text, dict): + out.extend(_render_text_strategy(text)) + if isinstance(json_cfg, dict): + out.extend(_render_json_strategy(json_cfg)) + + if isinstance(text, dict) and isinstance(json_cfg, dict): + out.append("- **Combined**: both text and JSON checks must pass (AND logic). 
" + "A single failing check fails the case.") + + if not isinstance(text, dict) and not isinstance(json_cfg, dict): + out.append("- _Neither text nor JSON strategy configured; the metric will FAIL by default._") + + return out + + +def _render_text_strategy(text: dict) -> list[str]: + match = str(text.get("match") or text.get("match_strategy") or "exact").strip().lower() + case_insensitive = bool(text.get("case_insensitive") or text.get("caseInsensitive")) + ignored = bool(text.get("ignore")) + + if ignored: + return ["- **Text comparison**: ``ignore=True`` — text check is skipped (always passes)"] + + mode_desc = { + "exact": "actual output must be **byte-equal** to expected", + "contains": "actual output must **contain** expected as a substring", + "regex": "expected is treated as a **regular expression**; matched via ``re.search``", + }.get(match, f"``{match}``") + case_note = "case-insensitive" if case_insensitive else "case-sensitive" + + return [f"- **Text comparison** (``match=\"{match}\"``, {case_note}): {mode_desc}"] + + +def _render_json_strategy(json_cfg: dict) -> list[str]: + if bool(json_cfg.get("ignore")): + return ["- **JSON comparison**: ``ignore=True`` — JSON check is skipped"] + + out = ["- **JSON comparison**: actual and expected are parsed as JSON, then compared structurally"] + ignore_tree = _pick(json_cfg, "ignore_tree", "ignoreTree") + tolerance = _pick(json_cfg, "number_tolerance", "numberTolerance") + if isinstance(ignore_tree, dict) and ignore_tree: + out.append(f" - Keys ignored before compare (``ignore_tree``): ``{ignore_tree}``") + if tolerance is not None: + out.append(f" - Numeric tolerance: {tolerance}") + else: + out.append(" - Numeric tolerance: 1e-6 (default)") + return out + + +def _render_llm_judge_criterion(criterion: dict, *, metric_name: str) -> list[str]: + out: list[str] = [] + + llm = _pick(criterion, "llm_judge", "llmJudge") + if not isinstance(llm, dict) or not llm: + out.append("- _No ``llm_judge`` config provided; the 
metric will fail to evaluate._") + return out + + single = _pick(llm, "judge_model", "judgeModel") + multi = _pick(llm, "judge_models", "judgeModels") + + if isinstance(multi, list) and multi: + out.append(f"- **Judge models** ({len(multi)} judges, each scores independently):") + for jm in multi: + if isinstance(jm, dict): + out.append(" - " + _format_judge_model(jm)) + agg = str(_pick(llm, "models_aggregator", "modelsAggregator") or "all_pass") + agg_expl = _AGGREGATOR_EXPLANATIONS.get(agg, "custom aggregator (registered separately).") + out.append(f"- **Cross-model aggregator** (``{agg}``): {agg_expl}") + parallel = llm.get("parallel", True) + par_text = ("yes (judges run concurrently)" if parallel else "no (judges run sequentially)") + out.append(f"- **Parallel execution**: {par_text}") + elif isinstance(single, dict): + out.append(f"- **Judge model**: {_format_judge_model(single)}") + else: + out.append("- _No judge model configured._") + + rubrics = llm.get("rubrics") or [] + if isinstance(rubrics, list) and rubrics: + out.append(f"- **Rubric items** ({len(rubrics)} items judged independently, each scored 0 or 1; " + "overall score = mean of sub-scores):") + for i, rubric in enumerate(rubrics, 1): + if not isinstance(rubric, dict): + continue + rid = rubric.get("id", f"rubric_{i}") + desc = rubric.get("description", "") + content = rubric.get("content") or {} + body = content.get("text", "") if isinstance(content, dict) else "" + head = f" {i}. 
**``{rid}``**" + if desc: + head += f" — {desc}" + out.append(head) + if body: + out.append(f" > {body}") + + if metric_name == PrebuiltMetrics.LLM_RUBRIC_KNOWLEDGE_RECALL.value: + knowledge_tools = _pick(llm, "knowledge_tool_names", "knowledgeToolNames") + if isinstance(knowledge_tools, list) and knowledge_tools: + out.append("- **Knowledge tools** (judge inspects results from these tool calls): " + f"``{', '.join(knowledge_tools)}``") + else: + out.append("- **Knowledge tools**: default knowledge tool set is used (no override).") + + return out + + +def _format_judge_model(jm: dict) -> str: + model = jm.get("model_name") or jm.get("modelName") or "" + extras: list[str] = [] + + num_samples = jm.get("num_samples") or jm.get("numSamples") + if isinstance(num_samples, int) and num_samples > 1: + extras.append(f"num_samples={num_samples}") + + gen = jm.get("generation_config") or jm.get("generationConfig") or {} + if isinstance(gen, dict): + if "temperature" in gen: + extras.append(f"temperature={gen['temperature']}") + mt = gen.get("max_tokens") or gen.get("maxTokens") + if mt is not None: + extras.append(f"max_tokens={mt}") + + weight = jm.get("weight") + if isinstance(weight, (int, float)) and float(weight) != 1.0: + extras.append(f"weight={weight}") + + think = jm.get("think") + if think is True: + extras.append("think=True") + elif think is False: + extras.append("think=False") + + base = f"``{model}``" + if extras: + return f"{base} ({', '.join(extras)})" + return base + + +def _render_feedback_fields(metric_name: str) -> list[str]: + out = ["- ``metric_name``, ``status`` (PASSED/FAILED), ``score``, ``threshold`` — always present"] + if metric_name == PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value: + out.append("- ``reason`` — short string (deterministic comparator; synthesized " + "from the criterion config when the matcher leaves it empty)") + return out + + out.append("- ``reason`` — natural-language explanation written by the LLM judge") + if metric_name in 
{ + PrebuiltMetrics.LLM_RUBRIC_RESPONSE.value, + PrebuiltMetrics.LLM_RUBRIC_KNOWLEDGE_RECALL.value, + }: + out.append("- ``rubric_scores`` — per-rubric breakdown; each item has ``id``, " + "``score``, and a ``reason`` written by the judge") + out.append("- ``per_model_scores`` (when multiple judge_models are configured) — " + "each judge's independent score/reason") + return out + + +def _render_reflection_hints(metric_name: str, criterion: dict) -> list[str]: + out: list[str] = [] + + if metric_name == PrebuiltMetrics.FINAL_RESPONSE_AVG_SCORE.value: + if _has_custom_compare(metric_name): + out.append("- Matching is delegated to user-provided Python code; format " + "requirements depend entirely on that comparator.") + return out + fr = _pick(criterion, "final_response", "finalResponse") or {} + text = fr.get("text") if isinstance(fr, dict) else None + match = "" + if isinstance(text, dict): + match = str(text.get("match") or text.get("match_strategy") or "exact").lower() + if match == "exact": + out.append("- Output must be **byte-exact**: stray whitespace or punctuation will FAIL.") + out.append("- Prompt should constrain the agent to emit *only* the expected literal text " + "with no extra prose or formatting.") + elif match == "contains": + out.append("- Output must literally **contain** the expected substring.") + out.append("- Prompt should drive the agent to emit that substring with correct " + "word order, punctuation, and units.") + elif match == "regex": + out.append("- Output is tested with ``re.search``; ensure the agent's response " + "satisfies the regex (think about how greediness and anchoring affect matching).") + if isinstance(fr, dict) and (fr.get("json") or fr.get("json_strategy") or fr.get("jsonStrategy")): + out.append("- JSON comparison is active; when the agent's output is parsed as JSON, " + "structural equality (after ``ignore_tree`` removal) matters.") + return out + + if metric_name == PrebuiltMetrics.LLM_FINAL_RESPONSE.value: + 
def _count_rubrics(criterion: dict) -> int:
    """Return how many rubric items the criterion's LLM-judge config declares.

    The judge config is looked up under ``llm_judge`` / ``llmJudge``. Anything
    that is not a dict holding a list under ``rubrics`` counts as zero.
    """
    judge_cfg = _pick(criterion, "llm_judge", "llmJudge")
    # A missing, empty or non-dict judge config has no rubrics to count.
    rubric_items = judge_cfg.get("rubrics") if isinstance(judge_cfg, dict) else None
    return len(rubric_items) if isinstance(rubric_items, list) else 0
" + "READ THEM CAREFULLY — every per-case feedback row references one of these metrics.\n\n") + +_REFLECTION_PROMPT_MID_BARE = ("\n\nBelow are example inputs, the assistant's responses, and per-case feedback " + "summarising how each metric scored the response.\n\n") + +_REFLECTION_PROMPT_FEEDBACK = ("## How to read each example\n\n" + "Every ``# Example N`` block below is a failed case rendered by GEPA " + "as nested markdown headers. The non-self-evident fields:\n\n" + "- ``## score`` is the case-level aggregate on [0, 1] (every metric, " + "every turn, every run rolled into one number); ``1.0`` would mean " + "every metric passed, so all examples here have ``score < 1.0``.\n" + "- ``## Case Body`` — a turn-sliced markdown block; the bulk of the " + "evidence lives here. Format described below.\n" + "- ``## Other Active Components`` *(present iff the candidate has " + "more than one prompt)* — the current text of every prompt OTHER " + "than the one you are about to rewrite (the target prompt is the " + "code-fenced block at the very top of this message). The verdict " + "you see was produced by the agent running with all prompts active, " + "so use these to:\n" + " Ā· avoid restating requirements already enforced elsewhere;\n" + " Ā· avoid contradicting another prompt's instructions;\n" + " Ā· spot gaps that no prompt currently covers.\n" + "- ``## history_top_k`` *(optional, present iff the case has prior " + "high-score runs from earlier candidates)* — a small list of " + "``{score, best_response}`` entries showing what previously scored well " + "on this case. Treat these as anchors: a rewrite that preserves the " + "pattern that produced those high scores is preferable to one that " + "regresses cases the optimizer already solved before.\n\n" + "## Case Body layout\n\n" + "``Case Body`` is a free-text markdown block. 
Each turn is one " + "``### Turn N`` section containing the conversational truth, the " + "agent's actual behaviour, and the per-turn verdict — kept together " + "so each failing metric is visually anchored to the turn that " + "produced it. Inside one turn:\n\n" + "```\n" + "### Turn N\n" + "**User**: \n" + "**Expected**: \n" + "**Agent Response**: \n" + "**Tool Trace**: (omitted if no tools were used)\n" + "- (=, ...) → [id=]\n" + "**Verdict** (Turn N):\n" + " [PASSED|FAILED] : score=, threshold=\n" + " reason: \n" + " Ā· rubric[]: PASS|FAIL score= reason: \n" + "```\n\n" + "Multi-run cases (``num_runs > 1``) nest each run inside the turn:\n\n" + "```\n" + "### Turn N\n" + "**User**: ...\n" + "**Expected**: ...\n" + "\n" + "#### Run 1\n" + "**Agent Response**: ...\n" + "**Tool Trace**: ...\n" + "**Verdict** (Turn N, Run 1):\n" + " ...\n" + "\n" + "#### Run 2\n" + "...\n" + "```\n\n" + "Multi-turn or multi-run cases close with an ``### Overall`` block " + "(``### Overall (case-level aggregate)`` for single-run, " + "``### Overall (per-run aggregate)`` for multi-run). Single-turn " + "single-run cases skip the Overall block because Turn 1 already " + "carries the only verdict that exists.\n\n" + "## Reading rules\n\n" + "- The reference answer ONLY appears in ``**Expected**``; it is " + "deliberately not echoed inside the Verdict line, so do not look for " + "it there.\n" + "- Every ```` in a Verdict line maps directly to a " + "``### Metric: `` section in the Metrics Reference above " + "— consult it for how the score is computed before deciding what to " + "change.\n" + "- Treat PASSING metrics as constraints, not noise: a rewrite that " + "fixes a FAILING metric while regressing a PASSING one is a " + "regression, not an improvement.\n\n" + "Examples follow:\n" + "```\n\n```\n\n" + "Read each example end-to-end, then rewrite the instruction so PASSING " + "metrics stay passing and FAILING metrics improve. 
Provide the new " + "instruction inside ``` blocks.\n") + + +def build_reflection_prompt_template(metric_reference_doc: str) -> str: + """Build the prompt template handed to GEPA's reflection LM. + + GEPA fills ```` with the current prompt text and ```` + with the rendered per-case feedback. The metric reference doc is wedged + between them so the LM has: (1) the current prompt, (2) a static metric + syllabus, (3) live per-case feedback, in that order. + + GEPA's ``InstructionProposalSignature.validate_prompt_template`` enforces + that both placeholders are present, so we always keep them — even when + ``metric_reference_doc`` is empty. + """ + doc = (metric_reference_doc or "").strip() + if doc: + middle = _REFLECTION_PROMPT_MID_WITH_DOC + doc + "\n\n" + else: + middle = _REFLECTION_PROMPT_MID_BARE + return _REFLECTION_PROMPT_PREFIX + middle + _REFLECTION_PROMPT_FEEDBACK + + +def _pick(d: dict, *keys: str) -> Optional[Any]: + """Return the first present value among ``keys`` (handles camelCase/snake_case aliases).""" + if not isinstance(d, dict): + return None + for k in keys: + if k in d: + return d[k] + return None diff --git a/trpc_agent_sdk/evaluation/_optimize_model_callable.py b/trpc_agent_sdk/evaluation/_optimize_model_callable.py new file mode 100644 index 0000000..9465dce --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_model_callable.py @@ -0,0 +1,309 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Synchronous LLM callable for optimizer prompt-rewrite operations. + +Conforms to gepa's ``LanguageModel`` Protocol so the same instance +serves as ``reflection_lm`` for ``gepa.optimize``. Internally drives a +framework :class:`LlmAgent` so optimize-model configuration honours +the framework's provider routing, env-variable expansion, and +``extra_fields`` pass-through. 
+""" + +from __future__ import annotations + +import asyncio +import copy +import os +import uuid +from typing import Any +from typing import Optional +from typing import Union + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.context import InvocationContext +from trpc_agent_sdk.context import create_agent_context +from trpc_agent_sdk.context import new_invocation_context_id +from trpc_agent_sdk.models import ModelRegistry +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.planners import BuiltInPlanner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import HttpOptions +from trpc_agent_sdk.types import Part +from trpc_agent_sdk.types import ThinkingConfig + +from ._optimize_model_options import OptimizeModelOptions + +DEFAULT_OPTIMIZE_MAX_TOKENS = 4096 +DEFAULT_OPTIMIZE_TEMPERATURE = 0.8 + + +def _expand_env(s: str) -> str: + """Expand environment variables in a string (e.g. 
def _merge_extra_body(
    http_options: Optional[HttpOptions],
    patch: dict[str, Any],
) -> HttpOptions:
    """Return HTTP options whose ``extra_body`` has ``patch`` merged into it.

    Merging goes one level deep. When both the existing value and the patch
    value for a key are dicts, their entries are combined and the patch wins
    on conflicts. Otherwise the patch value replaces the existing one. Patch
    values are deep-copied so the result never aliases the caller's ``patch``.

    Args:
        http_options: Options to extend, or ``None`` to start from scratch.
        patch: Top-level ``extra_body`` entries to merge in.

    Returns:
        A new ``HttpOptions`` when ``http_options`` is ``None``; otherwise a
        ``model_copy`` of it carrying the merged ``extra_body``.
    """
    existing: dict[str, Any] = {}
    if http_options is not None and http_options.extra_body:
        existing = http_options.extra_body

    combined: dict[str, Any] = dict(existing)
    for name, incoming in patch.items():
        current = combined.get(name)
        if isinstance(current, dict) and isinstance(incoming, dict):
            # Overlay the patch entries on a copy of the existing child dict.
            combined[name] = {
                **current,
                **{child_key: copy.deepcopy(child_val) for child_key, child_val in incoming.items()},
            }
        else:
            combined[name] = copy.deepcopy(incoming)

    if http_options is not None:
        return http_options.model_copy(update={"extra_body": combined})
    return HttpOptions(extra_body=combined)
# yapf: disable
def _build_optimize_generation_config(
    opts: OptimizeModelOptions,
) -> tuple[GenerateContentConfig, Optional[ThinkingConfig]]:
    # yapf: enable
    """Translate ``opts`` into a generation config plus an optional thinking config.

    The thinking config is returned separately because LlmAgent rejects it on
    GenerateContentConfig and requires it via BuiltInPlanner.

    Steps, in order:
    1. Copy sampling fields from ``opts.generation_config`` (token limit,
       temperature, top_p, stop sequences, presence/frequency penalties),
       falling back to the module defaults for token limit and temperature.
    2. Build a candidate ThinkingConfig from a ``thinking_config`` dict.
    3. Build ``cfg.http_options`` from an ``http_options`` dict.
    4. If ``opts.think`` is exactly ``True`` or ``False``, replace the
       thinking config and set ``chat_template_kwargs.enable_thinking`` in
       ``extra_body`` to match.
    """
    gen = opts.generation_config or {}
    cfg = GenerateContentConfig()

    token_limit = gen.get("max_tokens") or gen.get("max_output_tokens")
    cfg.max_output_tokens = token_limit or DEFAULT_OPTIMIZE_MAX_TOKENS
    cfg.temperature = gen.get("temperature", DEFAULT_OPTIMIZE_TEMPERATURE)

    if gen.get("top_p") is not None:
        cfg.top_p = gen["top_p"]

    # ``stop`` takes precedence over ``stop_sequences``; a bare value is wrapped.
    stop = gen.get("stop")
    if stop is not None:
        cfg.stop_sequences = stop if isinstance(stop, list) else [stop]
    elif gen.get("stop_sequences") is not None:
        cfg.stop_sequences = gen["stop_sequences"]

    for penalty_name in ("presence_penalty", "frequency_penalty"):
        if gen.get(penalty_name) is not None:
            setattr(cfg, penalty_name, gen[penalty_name])

    effective_thinking_config: Optional[ThinkingConfig] = None
    thinking_dict = gen.get("thinking_config")
    if isinstance(thinking_dict, dict):
        effective_thinking_config = ThinkingConfig(**thinking_dict)

    http_dict = gen.get("http_options")
    if isinstance(http_dict, dict):
        cfg.http_options = HttpOptions(**http_dict)

    # Only a genuine boolean overrides; ``None`` leaves both paths untouched.
    if isinstance(opts.think, bool):
        enabled = opts.think
        effective_thinking_config = ThinkingConfig(
            include_thoughts=enabled,
            thinking_budget=-1 if enabled else 0,
        )
        cfg.http_options = _merge_extra_body(
            cfg.http_options,
            {"chat_template_kwargs": {
                "enable_thinking": enabled
            }},
        )

    return cfg, effective_thinking_config
+ """ + if not event.is_final_response(): + return "" + if not event.content or not event.content.parts: + return "" + return "\n".join((p.text or "").strip() for p in event.content.parts if p.thought is not True).strip() + + +def _flatten_messages(prompt: Union[str, list[dict[str, Any]]]) -> str: + """Flatten gepa's prompt forms into a single user-text string. + + Accepts: + - str: returned verbatim + - list[dict]: messages with role/content; joined with role tags so the + downstream LlmAgent receives a single user turn that preserves the + original conversation structure + """ + if isinstance(prompt, str): + return prompt + if not isinstance(prompt, list): + return str(prompt) + parts: list[str] = [] + for msg in prompt: + if not isinstance(msg, dict): + parts.append(str(msg)) + continue + role = msg.get("role", "user") + content = msg.get("content", "") + if isinstance(content, list): + content = "".join(c.get("text", str(c)) for c in content if isinstance(c, dict)) + parts.append(f"[{role}]\n{content}") + return "\n\n".join(parts) + + +class _OptimizeModelCallable: + """Synchronous LLM callable wrapping a framework `LlmAgent`. + + Conforms to gepa's `LanguageModel` Protocol: + - `__call__(prompt: str | list[dict]) -> str` + - `total_cost: float` attribute (used by gepa's MaxReflectionCostStopper) + + LlmAgent topology: instruction = "" (callers embed their own system text + inside the prompt), single user turn, no tools, no planner unless + `think` requests one, output_schema = None. 
async def _run_async(self, user_text: str) -> str:
    """Run one single-turn invocation of the wrapped agent and return its text.

    Args:
        user_text: The already-flattened prompt, sent as the only user turn.

    Returns:
        The concatenated non-thought text of every final-response event the
        agent yields, stripped of surrounding whitespace. The result is ``""``
        when no such text was produced.

    Side effects:
        Adds each event's ``usage_metadata`` (when present) to
        ``self.total_token_usage`` via ``_accumulate_usage``.
    """
    user_content = Content(role="user", parts=[Part.from_text(text=user_text)])
    agent_context = create_agent_context()
    # Each call gets its own session with a random UUID id.
    # NOTE(review): nothing in this method removes the session afterwards;
    # confirm the in-memory service does not retain one per call indefinitely.
    session = await self._session_service.create_session(
        app_name="optimizer",
        user_id="optimize_model",
        session_id=str(uuid.uuid4()),
        agent_context=agent_context,
    )
    # ``override_messages`` is set to just the user turn; presumably this keeps
    # the agent from assembling any other history. TODO confirm against
    # InvocationContext.
    ctx = InvocationContext(
        session_service=self._session_service,
        invocation_id=new_invocation_context_id(),
        agent=self._agent,
        session=session,
        agent_context=agent_context,
        user_content=user_content,
        override_messages=[user_content],
    )
    last_text = ""
    async for event in self._agent.run_async(ctx):
        # Non-final events yield "" here and are skipped; text from several
        # final events is appended back-to-back with no separator.
        part_text = _extract_final_text(event)
        if part_text:
            last_text += part_text
        # Usage is read from every event, final or not.
        usage = getattr(event, "usage_metadata", None)
        if usage is not None:
            self._accumulate_usage(usage)
    return last_text.strip()
+ """ + prompt = self._read_count(usage, ("prompt_token_count", "input_tokens", "prompt_tokens")) + completion = self._read_count( + usage, + ("candidates_token_count", "output_tokens", "completion_tokens"), + ) + total = self._read_count(usage, ("total_token_count", "total_tokens")) + if total <= 0 and (prompt > 0 or completion > 0): + total = prompt + completion + self.total_token_usage["prompt"] += prompt + self.total_token_usage["completion"] += completion + self.total_token_usage["total"] += total + + @staticmethod + def _read_count(usage: Any, names: tuple[str, ...]) -> int: + """Return the first non-None int among the candidate attribute / key names.""" + for name in names: + value = None + if isinstance(usage, dict): + value = usage.get(name) + else: + value = getattr(usage, name, None) + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + return 0 diff --git a/trpc_agent_sdk/evaluation/_optimize_model_options.py b/trpc_agent_sdk/evaluation/_optimize_model_options.py new file mode 100644 index 0000000..7e15c54 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_optimize_model_options.py @@ -0,0 +1,45 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
class OptimizeModelOptions(EvalBaseModel):
    """LLM configuration for proposing new prompt candidates.

    Every field has a default, so the model can be built with no arguments.
    ``api_key`` is excluded from ``repr()`` so the secret does not end up in
    logs or tracebacks that print the options object; it is still a normal
    field for attribute access and serialization.
    """

    # Which provider/model to call.
    provider_name: str = Field(default="", description="LLM provider name.")
    model_name: str = Field(default="", description="Model name.")
    variant: str = Field(default="", description="OpenAI-compatible variant when provider is openai.")
    # Connection details.
    base_url: Optional[str] = Field(default=None, description="Custom endpoint URL.")
    api_key: str = Field(default="", repr=False, description="API key.")
    extra_fields: Optional[dict[str, Any]] = Field(
        default=None,
        description="Extra provider-specific fields.",
    )
    # Sampling / generation behaviour.
    num_samples: Optional[int] = Field(
        default=None,
        description="Number of samples per call.",
    )
    generation_config: Optional[dict[str, Any]] = Field(
        default=None,
        description="Generation params: max_tokens, temperature, stream, etc.",
    )
    weight: float = Field(
        default=1.0,
        description="Weight for aggregation across samples.",
    )
    think: Optional[bool] = Field(
        default=None,
        description="Thinking mode toggle. None: no change; False: disable; True: enable.",
    )
class OptimizerRegistry:
    """Lookup table from optimizer algorithm name to its BaseOptimizer subclass."""

    def __init__(self) -> None:
        self._registry: dict[str, Type[BaseOptimizer]] = {}

    def register(self, name: str, optimizer_class: "Type[BaseOptimizer]") -> None:
        """Store ``optimizer_class`` under ``name``.

        Raises:
            TypeError: ``optimizer_class`` is not a class derived from
                BaseOptimizer.
        """
        is_optimizer = inspect.isclass(optimizer_class) and issubclass(optimizer_class, BaseOptimizer)
        if not is_optimizer:
            raise TypeError(f"optimizer_class must be a subclass of BaseOptimizer, got {optimizer_class!r}")
        self._registry[name] = optimizer_class

    def list_registered(self) -> list[str]:
        """Return the registered algorithm names in sorted order."""
        return sorted(self._registry)

    def get(self, name: str) -> "Type[BaseOptimizer]":
        """Return the class registered under ``name``.

        Raises:
            ValueError: nothing is registered under ``name``; the message
                lists the available algorithm names.
        """
        if name in self._registry:
            return self._registry[name]
        available = self.list_registered()
        raise ValueError(f"No optimizer registered for algorithm: {name}. Available algorithms: {available}")


OPTIMIZER_REGISTRY = OptimizerRegistry()
logger = logging.getLogger(__name__)

_GEPA_LOGGER_NAME = "trpc_agent_sdk.optimizer.gepa"

_MAX_TARGET_FIELDS_IN_HEADER = 8
_FIELD_NAME_DISPLAY_LIMIT = 40


@dataclass(frozen=True)
class RunHeader:
    """Immutable description of a run, handed to reporters at run start."""

    # Registered algorithm name (e.g. ``gepa_reflective``).
    algorithm: str
    # Ordered ``(field_name, source_repr)`` pairs; ``source_repr`` is the file
    # path for ``add_path`` fields or ``""`` for ``add_callback`` fields.
    target_fields: list[tuple[str, str]]
    # Number of training cases.
    train_size: int
    # Number of validation cases.
    val_size: int
    # Display names of every reported metric.
    metric_names: list[str]
    # Resolved artifact directory.
    output_dir: str
    # Configured metric-call budget (e.g. ``max_metric_calls``); ``None``
    # falls back to an indeterminate progress display.
    budget_total: Optional[int] = None


@dataclass(frozen=True)
class RoundView:
    """Immutable summary of one optimization round, rendered as one line."""

    # 1-based round index from the algorithm.
    round: int
    # ``"reflective"`` (default) or ``"merge"``; unknown values render as
    # ``"reflective"``.
    kind: Literal["reflective", "merge"]
    # ``M`` in ``train(M/N)``; 0 when the round skipped before sampling.
    train_minibatch_size: int
    # ``N``: full training set size.
    train_size: int
    # Parent's score on the minibatch (None when no subsample was produced).
    train_subsample_parent_score: Optional[float]
    # New candidate's score on the minibatch (None when not evaluated).
    train_subsample_candidate_score: Optional[float]
    # Full validation pass rate when the candidate cleared the subsample
    # gate (None otherwise).
    val_pass_rate: Optional[float]
    # True iff the candidate joined the pool.
    accepted: bool
    # Human-readable reason for skipped rounds.
    skip_reason: Optional[str]
    # Set when the round ended in an error.
    error_message: Optional[str]
    # Wall-clock seconds.
    duration_seconds: float
    # Cumulative metric calls used (None when no budget is tracked).
    budget_used: Optional[int]
    # Configured ``max_metric_calls`` (None means ``"auto"``).
    budget_total: Optional[int]
    # Free-form algorithm-specific payload.
    extras: dict[str, Any] = field(default_factory=dict)
@runtime_checkable
class OptimizeReporter(Protocol):
    """Five-event surface every reporter backend implements.

    Backends are expected to swallow their own render errors; the facade
    also guards each call so a broken reporter never breaks optimization.
    """

    def run_started(self, header: "RunHeader") -> None:
        """Receive the static run context once, before any evaluation."""

    def baseline_evaluated(
        self,
        pass_rate: float,
        metric_breakdown: dict[str, float],
        *,
        metric_thresholds: Optional[dict[str, float]] = None,
    ) -> None:
        """Receive the baseline pass rate and per-metric scores."""

    def round_completed(self, view: "RoundView") -> None:
        """Receive the summary of one finished round."""

    def run_finished(
        self,
        result: "OptimizeResult",
        *,
        output_dir: str,
        update_source: bool,
    ) -> None:
        """Receive the final result of a run."""

    def run_failed(
        self,
        *,
        baseline_prompts: dict[str, str],
        output_dir: str,
        error_message: str,
    ) -> None:
        """Receive the failure details of a run that did not finish."""
class _NullReporter:
    """No-op reporter used when ``verbose=0``; every event is dropped."""

    def run_started(self, header: "RunHeader") -> None:
        """Ignore the run header."""
        return None

    def baseline_evaluated(
        self,
        pass_rate: float,
        metric_breakdown: dict[str, float],
        *,
        metric_thresholds: Optional[dict[str, float]] = None,
    ) -> None:
        """Ignore the baseline evaluation."""
        return None

    def round_completed(self, view: "RoundView") -> None:
        """Ignore the round summary."""
        return None

    def run_finished(
        self,
        result: "OptimizeResult",
        *,
        output_dir: str,
        update_source: bool,
    ) -> None:
        """Ignore the final result."""
        return None

    def run_failed(
        self,
        *,
        baseline_prompts: dict[str, str],
        output_dir: str,
        error_message: str,
    ) -> None:
        """Ignore the failure."""
        return None


def _truncate(text: str, limit: int) -> str:
    """Return ``text`` shortened to at most ``limit`` characters with ellipsis.

    Text that already fits is returned unchanged. When ``limit`` is too small
    to hold the three-dot ellipsis, the ellipsis itself is cut down (and a
    non-positive ``limit`` yields ``""``) so the result never exceeds
    ``limit`` characters.
    """
    if len(text) <= limit:
        return text
    ellipsis = "..."
    if limit <= len(ellipsis):
        return ellipsis[:max(0, limit)]
    return text[:limit - len(ellipsis)] + ellipsis


def _format_source(source_repr: str) -> str:
    """Compact a target source for display in the run header.

    Paths collapse to their basename; a value whose basename is empty
    (e.g. one ending in a path separator) is shown unchanged. The empty
    string is returned as is.
    """
    # NOTE(review): the original docstring mentions a callback sentinel; the
    # comparison value visible here is the empty string -- confirm upstream.
    if source_repr == "":
        return source_repr
    return os.path.basename(source_repr) or source_repr
def _format_sample_score_segment(view: "RoundView", *, ascii_only: bool) -> str:
    """Render the ``sample score parent → candidate`` segment, or empty when absent.

    When only one of the two scores is available, that single score is shown
    without an arrow.
    """
    parent = view.train_subsample_parent_score
    candidate = view.train_subsample_candidate_score
    if parent is None and candidate is None:
        return ""
    if parent is None:
        return f"sample score {candidate:.2f}"
    if candidate is None:
        return f"sample score {parent:.2f}"
    arrow = "->" if ascii_only else "→"
    return f"sample score {parent:.2f} {arrow} {candidate:.2f}"


def _format_evaluations_segment(view: "RoundView") -> str:
    """Render the trailing ``evaluations used/total`` segment, or empty when not tracked.

    A missing total is shown as ``auto``.
    """
    if view.budget_used is None:
        return ""
    total = "auto" if view.budget_total is None else str(view.budget_total)
    return f"evaluations {view.budget_used}/{total}"


def _round_marker(view: "RoundView", *, ascii_only: bool) -> str:
    """Return the leading marker glyph for a round line.

    Checked in this order (Unicode glyph / ASCII fallback):

      * error    -- ``✗`` / ``x``   ``error_message`` is set.
      * skipped  -- ``·`` / ``.``   ``skip_reason`` is set.
      * merge    -- ``↻`` / ``~``   ``kind == "merge"``.
      * accepted -- ``✓`` / ``OK``  ``accepted`` is true.
      * explored -- ``○`` / ``-``   anything else.
    """
    # Glyphs are spelled as escapes so a mis-decoded source file cannot
    # silently turn them into mojibake again.
    if view.error_message:
        return "x" if ascii_only else "\u2717"
    if view.skip_reason:
        return "." if ascii_only else "\u00b7"
    if view.kind == "merge":
        return "~" if ascii_only else "\u21bb"
    if view.accepted:
        return "OK" if ascii_only else "\u2713"
    return "-" if ascii_only else "\u25cb"
def _round_status_word(view: "RoundView") -> str:
    """Return the textual status label rendered next to the round marker.

    Precedence: error, skipped, merge (``merged`` when accepted, else
    ``merge``), accepted, explored.
    """
    if view.error_message:
        return "error"
    if view.skip_reason:
        return "skipped"
    if view.kind == "merge":
        return "merged" if view.accepted else "merge"
    if view.accepted:
        return "accepted"
    return "explored"


def _format_stop_reason_text(stop_reason: Optional[str]) -> Optional[str]:
    """Translate a ``stop_reason`` code into the reporter row text.

    Returns ``None`` for ``None`` (no row is emitted); unknown codes are
    returned unchanged.
    """
    if stop_reason is None:
        return None
    text_by_reason = {
        "required_metrics_passing": "required metrics met thresholds",
        "budget_exhausted": "budget exhausted (max_metric_calls reached)",
        "no_improvement": "no improvement for the configured number of rounds",
        "timeout": "timeout reached",
        "score_threshold": "score threshold reached",
        "max_candidate_proposals": "max candidate proposals reached",
        "max_tracked_candidates": "max tracked candidates reached",
        "user_requested_stop": "user requested stop (optimize.stop touched)",
        "completed": "completed (no stopper triggered)",
    }
    return text_by_reason.get(stop_reason, stop_reason)


def _round_legend_lines(*, ascii_only: bool) -> list[str]:
    """Return the static legend block describing round-line semantics.

    The status glyphs match the round markers: Unicode ``✓ ○ · ↻ ✗`` or,
    with ``ascii_only``, ``OK - . ~ x``.
    """
    # Glyphs are spelled as escapes so a mis-decoded source file cannot
    # silently turn them into mojibake again.
    arrow = "->" if ascii_only else "→"
    accepted = "OK" if ascii_only else "\u2713"
    explored = "-" if ascii_only else "\u25cb"
    skipped = "." if ascii_only else "\u00b7"
    merge = "~" if ascii_only else "\u21bb"
    error = "x" if ascii_only else "\u2717"
    return [
        "Round line legend:",
        f" format : round N train sample M/N "
        f"sample score parent {arrow} candidate "
        f"valset pass_rate Z evaluations used/total duration",
        f" status : {accepted} accepted {explored} explored "
        f"{skipped} skipped {merge} merge {error} error",
        " train : a minibatch of M cases sampled from the N-case training set "
        "for the reflective step.",
        " sample : parent vs new candidate score on that minibatch "
        "(skip gate decides whether to run valset).",
        " valset : pass_rate over the full validation set when the candidate "
        "cleared the skip gate.",
        " budget : evaluations used / configured budget (metric calls).",
    ]
def _improvement_arrow(delta: float, *, ascii_only: bool) -> str:
    """Return the directional arrow for a pass-rate delta.

    Positive deltas give ``▲`` (``^`` in ASCII), negative ``▼`` (``v``),
    zero ``=``.
    """
    # Glyphs are spelled as escapes so a mis-decoded source file cannot
    # silently turn them into mojibake again.
    if delta > 0:
        return "^" if ascii_only else "\u25b2"
    if delta < 0:
        return "v" if ascii_only else "\u25bc"
    return "="


def _format_improvement_label(delta: float) -> str:
    """Return a textual label describing the improvement direction."""
    if delta > 0:
        return "improved"
    if delta < 0:
        return "regressed"
    return "no improvement"


def _format_round_line(view: "RoundView", *, ascii_only: bool) -> str:
    """Render the one-line summary of a round.

    The line is: marker, ``round N``, status word, then the applicable
    segments (train sample size, sample scores, one of error message / skip
    reason / valset pass rate, evaluations used) and finally the duration.
    Segments that do not apply to the round are omitted.
    """
    marker = _round_marker(view, ascii_only=ascii_only)
    status_word = _round_status_word(view)
    head = f"{marker} round {view.round} {status_word}"

    segments: list[str] = []
    if view.train_minibatch_size > 0:
        segments.append(f"train sample {view.train_minibatch_size}/{view.train_size}")
    sample = _format_sample_score_segment(view, ascii_only=ascii_only)
    if sample:
        segments.append(sample)

    # Exactly one outcome segment: an error wins over a skip reason, which
    # wins over the validation pass rate.
    if view.error_message:
        segments.append(f"message: {view.error_message}")
    elif view.skip_reason:
        segments.append(f"reason: {view.skip_reason}")
    elif view.val_pass_rate is not None:
        segments.append(f"valset pass_rate {view.val_pass_rate:.4f}")

    evaluations = _format_evaluations_segment(view)
    if evaluations:
        segments.append(evaluations)

    body = " ".join(segments)
    tail = f" {view.duration_seconds:.1f}s"
    return f"{head} {body}{tail}"
def _ordered_metric_keys(*breakdowns: dict[str, float], extra: Optional[list[str]] = None) -> list[str]:
    """Return the de-duplicated metric names in first-seen order.

    Names from ``extra`` (an optional ordering hint) come first, followed by
    the keys of each breakdown in argument order.
    """
    names: list[str] = list(extra) if extra else []
    for breakdown in breakdowns:
        names.extend(breakdown)
    # dict preserves insertion order, so this keeps the first occurrence.
    return list(dict.fromkeys(names))


def _format_score(value: Optional[float]) -> str:
    """Format a metric score with four decimals, or a dash placeholder for ``None``."""
    return " - " if value is None else f"{value:.4f}"


def _format_delta(value: float, *, ascii_only: bool) -> tuple[str, str]:
    """Return ``(arrow, text)`` for a per-metric change; non-negative text carries ``+``."""
    prefix = "" if value < 0 else "+"
    return _improvement_arrow(value, ascii_only=ascii_only), f"{prefix}{value:.4f}"


def _baseline_metric_status(
    score: Optional[float],
    threshold: Optional[float],
    *,
    ascii_only: bool,
) -> str:
    """Return ``PASS`` when ``score >= threshold``, ``FAIL`` otherwise.

    A dash placeholder is returned when either value is missing. The label
    text is the same for ASCII and Unicode output; ``ascii_only`` is accepted
    for signature parity with the other formatters.
    """
    if score is None or threshold is None:
        return " - "
    return "PASS" if score >= threshold else "FAIL"
class _AsciiReporter:
    """Dependency-free reporter used as fallback for non-Rich environments.

    Renders every event as ordered plain text written to ``stream``; safe for
    log files and CI pipes. Falls back to ASCII glyphs when the stream
    encoding cannot represent the Unicode marker set.
    """

    def __init__(self, *, stream: Optional[TextIO] = None, verbose: int = 1) -> None:
        """Bind the output stream and probe its encoding.

        Args:
            stream: Destination for all output. ``None`` means the
                ``sys.stdout`` that is current when the reporter is created,
                so stdout redirection installed after import is honoured.
            verbose: Verbosity level; stored but not read by this class.
        """
        self._stream = stream if stream is not None else sys.stdout
        self._verbose = verbose
        self._ascii_only = self._detect_ascii_only()

    def _detect_ascii_only(self) -> bool:
        """Return True when the stream encoding cannot render Unicode glyphs."""
        encoding = getattr(self._stream, "encoding", None) or sys.getdefaultencoding()
        try:
            # Every non-ASCII glyph the reporter may emit:
            # check, cross, middle dot, merge arrow, up/down triangles, circle.
            "\u2713\u2717\u00b7\u21bb\u25b2\u25bc\u25cb".encode(encoding)
        except (LookupError, UnicodeEncodeError):
            return True
        return False

    def _title_separator(self) -> str:
        """Return the separator used in banner titles (ASCII-safe when needed)."""
        return "-" if self._ascii_only else "\u00b7"

    def run_started(self, header: "RunHeader") -> None:
        """Print the run banner, target/metric listing, budget and round legend."""
        lines = [
            "",
            "=" * 80,
            f" AgentOptimizer {self._title_separator()} {header.algorithm}",
            "=" * 80,
            self._format_targets_line(header.target_fields),
        ]
        for name, src in header.target_fields[:_MAX_TARGET_FIELDS_IN_HEADER]:
            display_name = _truncate(name, _FIELD_NAME_DISPLAY_LIMIT)
            lines.append(f" - {display_name:<40s} ({_format_source(src)})")
        hidden = len(header.target_fields) - _MAX_TARGET_FIELDS_IN_HEADER
        if hidden > 0:
            lines.append(f" ... and {hidden} more")
        lines.append(f" train/val : {header.train_size} / {header.val_size} cases")
        lines.append(f" metrics : {len(header.metric_names)} configured")
        lines.extend(f" - {name}" for name in header.metric_names)
        if header.budget_total is not None:
            lines.append(f" budget : {header.budget_total} metric calls")
        else:
            lines.append(" budget : auto (no explicit cap)")
        lines.append(f" output_dir : {header.output_dir}")
        lines.append("-" * 80)
        lines.append("")
        lines.extend(_round_legend_lines(ascii_only=self._ascii_only))
        lines.append("")
        self._writelines(lines)

    @staticmethod
    def _format_targets_line(target_fields: list[tuple[str, str]]) -> str:
        """Return the singular/plural target-count line for the banner."""
        if len(target_fields) == 1:
            return " target : 1 field"
        return f" targets : {len(target_fields)} fields"

    def baseline_evaluated(
        self,
        pass_rate: float,
        metric_breakdown: dict[str, float],
        *,
        metric_thresholds: Optional[dict[str, float]] = None,
    ) -> None:
        """Print the baseline pass rate and one threshold/score/status row per metric."""
        thresholds = metric_thresholds or {}
        lines = [f"baseline pass_rate = {pass_rate:.4f}"]
        keys = _ordered_metric_keys(metric_breakdown, extra=list(thresholds.keys()))
        if keys:
            lines.append(" per-metric (threshold | score | status):")
            for name in keys:
                score = metric_breakdown.get(name)
                threshold = thresholds.get(name)
                status = _baseline_metric_status(score, threshold, ascii_only=self._ascii_only)
                threshold_str = (f"{threshold:.4f}" if threshold is not None else " - ")
                score_str = _format_score(score)
                lines.append(f" - {name:<40s} threshold {threshold_str} "
                             f"{score_str} {status}")
        lines.append("")
        self._writelines(lines)

    def round_completed(self, view: "RoundView") -> None:
        """Print the single-line summary of one round."""
        self._writelines([_format_round_line(view, ascii_only=self._ascii_only)])

    def run_finished(
        self,
        result: "OptimizeResult",
        *,
        output_dir: str,
        update_source: bool,
    ) -> None:
        """Print a blank line followed by the closing summary block."""
        self._writelines([""])
        self._writelines(self._build_summary_lines(
            result=result,
            output_dir=output_dir,
            update_source=update_source,
        ))

    def run_failed(
        self,
        *,
        baseline_prompts: dict[str, str],
        output_dir: str,
        error_message: str,
    ) -> None:
        """Print the failure banner; ``baseline_prompts`` is not rendered."""
        self._writelines([
            "",
            "=" * 80,
            " Optimization FAILED",
            "=" * 80,
            f" error : {error_message}",
            f" output_dir : {output_dir}",
            f" baseline preserved at {os.path.join(output_dir, 'baseline_prompts')}",
            "=" * 80,
            "",
        ])

    def _build_summary_lines(
        self,
        *,
        result: "OptimizeResult",
        output_dir: str,
        update_source: bool,
    ) -> list[str]:
        """Return the multi-line summary block printed at run finish."""
        arrow = _improvement_arrow(result.pass_rate_improvement, ascii_only=self._ascii_only)
        label = _format_improvement_label(result.pass_rate_improvement)
        accepted = sum(1 for r in result.rounds if r.accepted)
        sign = "+" if result.pass_rate_improvement >= 0 else ""
        rate_line = (f" pass_rate : {result.baseline_pass_rate:.4f} -> {result.best_pass_rate:.4f}"
                     f" {arrow} {sign}{result.pass_rate_improvement:.4f} ({label})")
        lines = [
            "=" * 80,
            f" Optimization complete {self._title_separator()} {result.status}",
            "=" * 80,
            rate_line,
            f" rounds : {accepted} accepted / {result.total_rounds} total",
            f" duration : {result.duration_seconds:.2f}s",
        ]
        stop_text = _format_stop_reason_text(result.stop_reason)
        if stop_text is not None:
            lines.append(f" stopped by : {stop_text}")
        if result.status != "SUCCEEDED" and result.error_message:
            lines.append(f" error : {result.error_message}")
        metric_keys = _ordered_metric_keys(
            result.baseline_metric_breakdown,
            result.best_metric_breakdown,
            extra=list(result.metric_thresholds.keys()),
        )
        if metric_keys:
            lines.append(" per-metric : threshold | baseline -> best | delta | status")
            for name in metric_keys:
                base = result.baseline_metric_breakdown.get(name)
                best = result.best_metric_breakdown.get(name)
                threshold = result.metric_thresholds.get(name)
                # A metric missing on either side counts as 0.0 for the delta.
                delta = (best or 0.0) - (base or 0.0)
                d_arrow, d_text = _format_delta(delta, ascii_only=self._ascii_only)
                base_str = _format_score(base)
                best_str = _format_score(best)
                threshold_str = (f"{threshold:.4f}" if threshold is not None else " - ")
                status = _baseline_metric_status(best, threshold, ascii_only=self._ascii_only)
                lines.append(f" - {name:<40s} threshold {threshold_str} "
                             f"{base_str} -> {best_str} {d_arrow} {d_text} {status}")
        update_msg = self._format_update_source_line(result=result, output_dir=output_dir, update_source=update_source)
        if update_msg:
            lines.append(update_msg)
        lines.extend(self._format_artifacts_block(result=result, output_dir=output_dir))
        lines.append("=" * 80)
        lines.append("")
        return lines

    @staticmethod
    def _format_update_source_line(
        *,
        result: "OptimizeResult",
        output_dir: str,
        update_source: bool,
    ) -> Optional[str]:
        """Return the ``update_source`` row text.

        The text after the first ``:`` is the value part; keep that shape, as
        the row is split on the first colon elsewhere in this module.
        """
        if not update_source:
            best_dir = os.path.join(output_dir, "best_prompts")
            return f" update_source: false (best prompts at {best_dir}/)"
        if result.status == "SUCCEEDED":
            return " update_source: true (best written back to target sources)"
        return " update_source: true (run failed; sources restored from baseline)"

    @staticmethod
    def _format_artifacts_block(
        *,
        result: "OptimizeResult",
        output_dir: str,
    ) -> list[str]:
        """Return the artifact directory listing lines for the summary."""
        lines = [" artifacts :"]
        lines.append(f" {output_dir}/")
        for name, content in result.best_prompts.items():
            display = _truncate(name, _FIELD_NAME_DISPLAY_LIMIT)
            lines.append(f" best_prompts/{display}.md ({len(content)} chars)")
        lines.append(" result.json summary.txt rounds/ run.log")
        return lines

    def _writelines(self, lines: list[str]) -> None:
        """Write a list of lines to the stream, swallowing render errors.

        Deliberately best-effort: a failing stream is logged as a warning and
        never propagates into the optimization run.
        """
        try:
            self._stream.write("\n".join(lines))
            self._stream.write("\n")
            try:
                self._stream.flush()
            except (AttributeError, ValueError):  # pragma: no cover - non-flushable buffers
                pass
        except Exception:  # pragma: no cover - never break optimization on render error
            logger.warning("AsciiReporter write failed", exc_info=True)
and {remainder} more[/dim]") + + table.add_row("train/val", f"{header.train_size} / {header.val_size} cases") + metric_count_label = ("metric" if len(header.metric_names) == 1 else f"metrics ({len(header.metric_names)})") + table.add_row(metric_count_label, "") + for name in header.metric_names: + table.add_row("", f"- {name}") + + budget_text = (f"{header.budget_total} metric calls" + if header.budget_total is not None else "auto (no explicit cap)") + table.add_row("budget", budget_text) + table.add_row("output_dir", header.output_dir) + + panel = Panel( + table, + title=f"[bold]AgentOptimizer[/bold] Ā· [cyan]{header.algorithm}[/cyan]", + box=box.ROUNDED, + padding=(0, 1), + ) + self._console.print(panel) + self._console.print("") + for line in _round_legend_lines(ascii_only=False): + self._console.print(f"[dim]{line}[/dim]") + self._console.print("") + + self._budget_total = header.budget_total + # ``auto_refresh=False`` keeps the Live region quiescent between + # explicit refresh calls — embedded IDE terminals and some CI + # captures don't honour rich's cursor-up escapes, so the default + # 10Hz auto-refresh would re-print the bar instead of erasing + # it. Manual refresh on each ``round_completed`` keeps the + # output bounded to one line per event. 
+ self._progress = Progress( + TextColumn("[bold]progress[/bold]"), + BarColumn(bar_width=None), + TextColumn("{task.completed}/{task.total} metric calls"), + TextColumn("•"), + TimeElapsedColumn(), + console=self._console, + transient=False, + expand=True, + auto_refresh=False, + ) + total = header.budget_total if header.budget_total is not None else 100 + self._budget_task = self._progress.add_task("budget", total=total) + try: + self._progress.start() + self._progress.refresh() + except Exception: # pragma: no cover - Live region best-effort + self._progress = None + self._budget_task = None + + def baseline_evaluated( + self, + pass_rate: float, + metric_breakdown: dict[str, float], + *, + metric_thresholds: Optional[dict[str, float]] = None, + ) -> None: + from rich.table import Table + from rich import box + + thresholds = metric_thresholds or {} + self._console.print(f"[bold]baseline pass_rate = {pass_rate:.4f}[/bold]") + keys = _ordered_metric_keys(metric_breakdown, extra=list(thresholds.keys())) + if keys: + t = Table(box=box.SIMPLE, show_header=True, header_style="dim") + t.add_column("metric", no_wrap=True) + t.add_column("threshold", justify="right") + t.add_column("baseline", justify="right") + t.add_column("status", justify="right") + for name in keys: + score = metric_breakdown.get(name) + threshold = thresholds.get(name) + threshold_str = (f"{threshold:.4f}" if threshold is not None else "-") + score_str = (f"{score:.4f}" if score is not None else "-") + status = _baseline_metric_status(score, threshold, ascii_only=False) + color = ("green" if status == "PASS" else "red" if status == "FAIL" else "dim") + t.add_row( + name, + threshold_str, + score_str, + f"[{color}]{status}[/{color}]", + ) + self._console.print(t) + self._console.print("") + + def round_completed(self, view: RoundView) -> None: + if self._progress is not None and view.budget_used is not None: + try: + if self._budget_total is None: + # When no upper bound was set, grow the bar with 
usage. + self._progress.update( + self._budget_task, + completed=view.budget_used, + total=max(view.budget_used, 1), + ) + else: + self._progress.update( + self._budget_task, + completed=min(view.budget_used, self._budget_total), + ) + # Explicit refresh because ``auto_refresh=False`` keeps + # the Live region quiescent between events. + self._progress.refresh() + except Exception: # pragma: no cover + pass + + marker = _round_marker(view, ascii_only=False) + status_word = _round_status_word(view) + style = self._round_style(view) + head = (f"[{style}]{marker} round {view.round} {status_word}[/{style}]") + + segments: list[str] = [] + if view.train_minibatch_size > 0: + segments.append(f"train sample {view.train_minibatch_size}/{view.train_size}") + sample = _format_sample_score_segment(view, ascii_only=False) + if sample: + segments.append(sample) + if view.error_message: + segments.append(f"[red]message: {view.error_message}[/red]") + elif view.skip_reason: + segments.append(f"[dim]reason: {view.skip_reason}[/dim]") + elif view.val_pass_rate is not None: + segments.append(f"[green]valset pass_rate {view.val_pass_rate:.4f}[/green]") + evaluations = _format_evaluations_segment(view) + if evaluations: + segments.append(f"[dim]{evaluations}[/dim]") + body = " ".join(segments) + tail = f" [dim]{view.duration_seconds:.1f}s[/dim]" + self._console.print(f"{head} {body}{tail}") + + @staticmethod + def _round_style(view: RoundView) -> str: + """Return the Rich style string for the round marker.""" + if view.error_message: + return "bold red" + if view.skip_reason: + return "dim" + if view.accepted: + return "bold green" + return "yellow" + + def _stop_progress(self) -> None: + if self._progress is None: + return + try: + self._progress.stop() + except Exception: # pragma: no cover + pass + self._progress = None + self._budget_task = None + + def run_finished( + self, + result: "OptimizeResult", + *, + output_dir: str, + update_source: bool, + ) -> None: + from rich.panel 
import Panel + from rich.table import Table + from rich import box + + self._stop_progress() + + accepted = sum(1 for r in result.rounds if r.accepted) + sign = "+" if result.pass_rate_improvement >= 0 else "" + arrow = _improvement_arrow(result.pass_rate_improvement, ascii_only=False) + label = _format_improvement_label(result.pass_rate_improvement) + delta_color = ("green" + if result.pass_rate_improvement > 0 else "red" if result.pass_rate_improvement < 0 else "dim") + + table = Table.grid(padding=(0, 2)) + table.add_column(no_wrap=True, style="dim") + table.add_column(no_wrap=False) + rate_value = (f"{result.baseline_pass_rate:.4f} -> [bold]{result.best_pass_rate:.4f}[/bold] " + f"[{delta_color}]{arrow} {sign}{result.pass_rate_improvement:.4f}[/{delta_color}] " + f"[{delta_color}]({label})[/{delta_color}]") + table.add_row("pass_rate", rate_value) + table.add_row("rounds", f"{accepted} accepted / {result.total_rounds} total") + table.add_row("duration", f"{result.duration_seconds:.2f}s") + stop_text = _format_stop_reason_text(result.stop_reason) + if stop_text is not None: + table.add_row("stopped by", stop_text) + if result.status != "SUCCEEDED" and result.error_message: + table.add_row("error", f"[red]{result.error_message}[/red]") + update_msg = _AsciiReporter._format_update_source_line(result=result, + output_dir=output_dir, + update_source=update_source) + if update_msg: + table.add_row("update_source", update_msg.split(":", 1)[1].strip()) + table.add_row("artifacts", f"{output_dir}/") + for name, content in result.best_prompts.items(): + display = _truncate(name, _FIELD_NAME_DISPLAY_LIMIT) + table.add_row("", f"best_prompts/{display}.md [dim]({len(content)} chars)[/dim]") + table.add_row("", "result.json summary.txt rounds/ run.log") + + title_style = "bold green" if result.status == "SUCCEEDED" else "bold red" + panel = Panel( + table, + title=f"[{title_style}]Optimization complete Ā· {result.status}[/{title_style}]", + box=box.ROUNDED, + padding=(0, 1), 
+ ) + self._console.print("") + self._console.print(panel) + + metric_keys = _ordered_metric_keys( + result.baseline_metric_breakdown, + result.best_metric_breakdown, + extra=list(result.metric_thresholds.keys()), + ) + if metric_keys: + mt = Table( + title="per-metric scores", + box=box.SIMPLE_HEAVY, + show_header=True, + header_style="bold", + title_style="dim", + ) + mt.add_column("metric", no_wrap=True) + mt.add_column("threshold", justify="right") + mt.add_column("baseline", justify="right") + mt.add_column("best", justify="right") + mt.add_column("delta", justify="right") + mt.add_column("status", justify="right") + for name in metric_keys: + base = result.baseline_metric_breakdown.get(name) + best = result.best_metric_breakdown.get(name) + threshold = result.metric_thresholds.get(name) + delta = (best or 0.0) - (base or 0.0) + d_color = ("green" if delta > 0 else "red" if delta < 0 else "dim") + d_arrow, d_text = _format_delta(delta, ascii_only=False) + base_str = _format_score(base) + best_str = _format_score(best) + threshold_str = (f"{threshold:.4f}" if threshold is not None else "-") + status = _baseline_metric_status(best, threshold, ascii_only=False) + status_color = ("green" if status == "PASS" else "red" if status == "FAIL" else "dim") + mt.add_row( + name, + threshold_str, + base_str, + best_str, + f"[{d_color}]{d_arrow} {d_text}[/{d_color}]", + f"[{status_color}]{status}[/{status_color}]", + ) + self._console.print(mt) + + def run_failed( + self, + *, + baseline_prompts: dict[str, str], + output_dir: str, + error_message: str, + ) -> None: + from rich.panel import Panel + from rich import box + + self._stop_progress() + + body = (f"[red]error :[/red] {error_message}\n" + f"output_dir : {output_dir}\n" + f"baseline preserved at {os.path.join(output_dir, 'baseline_prompts')}") + panel = Panel( + body, + title="[bold red]Optimization FAILED[/bold red]", + box=box.ROUNDED, + padding=(0, 1), + ) + self._console.print("") + self._console.print(panel) + + 
class _SilentGepaLogger:
    """Logger sink handed to gepa so its library output can be silenced.

    With ``verbose <= 1`` every message is dropped. With ``verbose >= 2``
    messages are forwarded at INFO level to the logger named by
    ``_GEPA_LOGGER_NAME`` so callers can route them through the standard
    ``logging`` configuration.
    """

    def __init__(self, *, verbose: int) -> None:
        self._verbose = verbose
        # Only resolve the target logger when forwarding is enabled; ``None``
        # is the "drop everything" marker checked by ``log``.
        self._target = logging.getLogger(_GEPA_LOGGER_NAME) if verbose >= 2 else None

    def log(self, message: str) -> None:
        """Forward *message* at INFO level, or drop it when forwarding is off."""
        if self._target is not None:
            # Lazy %-formatting: the message is passed as an argument so stray
            # ``%`` characters in library output are never interpreted.
            self._target.info("%s", message)


def create_reporter(
    *,
    verbose: int = 1,
    stream: Optional[TextIO] = None,
) -> OptimizeReporter:
    """Pick the appropriate reporter backend.

    Resolution order: ``verbose == 0`` returns :class:`_NullReporter`;
    otherwise the factory attempts to import ``rich`` and returns
    :class:`_RichReporter` on success or :class:`_AsciiReporter` on failure.
    Unknown ``verbose`` values are normalised to ``1``.

    Args:
        verbose: ``0`` for silent, ``1`` for normal, ``2`` for detailed output.
        stream: Text stream the reporter writes to. ``None`` (the default)
            means "whatever ``sys.stdout`` is when this function is called".

    Returns:
        A reporter instance matching the requested verbosity and the
        availability of ``rich``.
    """
    # Resolve stdout at call time. A ``stream=sys.stdout`` default would be
    # frozen at import time and ignore later redirection (for example
    # ``contextlib.redirect_stdout`` or a test capture fixture).
    if stream is None:
        stream = sys.stdout
    if verbose == 0:
        return _NullReporter()
    if verbose not in (1, 2):
        verbose = 1
    try:
        import rich  # noqa: F401
    except ImportError:
        return _AsciiReporter(stream=stream, verbose=verbose)
    return _RichReporter(stream=stream, verbose=verbose)
+"""Optimization result data structures.""" + +from __future__ import annotations + +import os +from typing import Any +from typing import Literal +from typing import Optional + +from pydantic import Field + +from ._common import EvalBaseModel + +RunStatus = Literal["SUCCEEDED", "FAILED", "CANCELED"] + +FinishReason = Literal[ + "completed", + "perfect_pass_rate", + "no_improvement", + "error", +] + +StopReason = Literal[ + "required_metrics_passing", + "budget_exhausted", + "no_improvement", + "timeout", + "score_threshold", + "max_candidate_proposals", + "max_tracked_candidates", + "user_requested_stop", + "completed", +] + +RoundKind = Literal["reflective", "merge"] + + +class RoundRecord(EvalBaseModel): + """Per-round optimization record. + + Attributes: + round: 1-based round index. + optimized_field_names: Field names actually rewritten by the optimize model this round. + candidate_prompts: Full candidate map for the round; reused fields carry the previous text. + train_pass_rate: Currently always 0.0; see field description below. + validation_pass_rate: Pass rate on the validation split. + metric_breakdown: Mean score per metric on the validation split. + accepted: True iff the candidate was accepted as new best. + acceptance_reason: Human-readable reason for the acceptance decision. + failed_case_ids: Eval case ids that failed the validation split this round. + failed_cases_truncated: Number of failed cases dropped by token-budget truncation. + per_field_diagnosis: Diagnosis text from the reflection LM, keyed by optimized field name. + reflection_lm_calls: Number of reflection LM invocations this round (including retries). + round_llm_cost: USD cost for this round (reflection LM + evaluator). + round_token_usage: Token usage for this round; keys are "prompt", "completion", "total". + started_at: ISO-8601 timestamp when the round started. + duration_seconds: Wall-clock duration of the round in seconds. 
+ extras: Free-form business payload; the optimizer never reads or modifies it. + """ + + round: int = Field(description="1-based round index.") + optimized_field_names: list[str] = Field(description="Field names rewritten by the optimize model this round.", ) + candidate_prompts: dict[str, str] = Field(description="Full candidate prompt map for the round.", ) + + train_pass_rate: float = Field( + default=0.0, + description=("Currently always 0.0: gepa does not expose a full-train-set pass " + "rate (it only samples minibatches each round). Use " + "train_subsample_parent_score / train_subsample_candidate_score " + "for per-round minibatch metrics instead."), + ) + validation_pass_rate: float = Field(description="Pass rate on the validation split.") + metric_breakdown: dict[str, float] = Field( + default_factory=dict, + description=("Mean score per metric on the validation split. Empty when the " + "round was skipped before valset evaluation, or when the " + "evaluator did not expose per-metric scores."), + ) + + accepted: bool = Field(description="True iff the candidate was accepted as new best.") + acceptance_reason: str = Field(default="", description="Human-readable acceptance reason.") + + failed_case_ids: list[str] = Field( + default_factory=list, + description="Eval case ids that failed validation this round.", + ) + failed_cases_truncated: int = Field( + default=0, + description="Number of failed cases dropped by token-budget truncation.", + ) + per_field_diagnosis: dict[str, str] = Field( + default_factory=dict, + description="Diagnosis text from the reflection LM, keyed by optimized field name.", + ) + reflection_lm_calls: int = Field( + default=0, + description="Number of reflection LM invocations this round (including retries).", + ) + + round_llm_cost: float = Field( + default=0.0, + description="USD cost for this round (reflection LM + evaluator).", + ) + round_token_usage: dict[str, int] = Field( + default_factory=lambda: { + "prompt": 0, + 
"completion": 0, + "total": 0 + }, + description='Token usage for this round; keys are "prompt", "completion", "total".', + ) + + started_at: str = Field(description="ISO-8601 timestamp when the round started.") + duration_seconds: float = Field(description="Wall-clock duration of the round in seconds.") + + kind: RoundKind = Field( + default="reflective", + description=("Mutation kind for this round: 'reflective' for the standard " + "reflective proposal step and 'merge' for system-aware merges."), + ) + train_minibatch_size: int = Field( + default=0, + description=("Cases sampled from the training set this round. 0 when the round " + "skipped before sampling (e.g. 'no proposal')."), + ) + train_subsample_parent_score: Optional[float] = Field( + default=None, + description=("Parent candidate's score on the sampled minibatch; None when no " + "subsample was produced."), + ) + train_subsample_candidate_score: Optional[float] = Field( + default=None, + description=("New candidate's score on the sampled minibatch; None when no " + "candidate was evaluated."), + ) + skip_reason: Optional[str] = Field( + default=None, + description=("Human-readable reason set on skipped rounds (e.g. " + "'subsample perfect', 'no proposal'). None when the round ran " + "normally or ended in an error."), + ) + error_message: Optional[str] = Field( + default=None, + description="Error message when the round ended in an algorithm error.", + ) + budget_used: Optional[int] = Field( + default=None, + description=("Cumulative metric calls consumed across all rounds so far. None " + "when the algorithm does not track a budget."), + ) + budget_total: Optional[int] = Field( + default=None, + description="Configured budget cap (e.g. max_metric_calls); None means 'auto'.", + ) + + extras: dict[str, Any] = Field( + default_factory=dict, + description="Free-form business payload; optimizer ignores it.", + ) + + +class OptimizeResult(EvalBaseModel): + """Top-level optimization result. 
+ + Attributes: + schema_version: Result schema version; bumped on breaking layout changes. + algorithm: Algorithm name that produced this result. + status: Final run status. + finish_reason: Why the loop stopped. + error_message: Error message when status is FAILED. + baseline_pass_rate: Validation pass rate of the baseline prompts. + best_pass_rate: Validation pass rate of the best prompts. + pass_rate_improvement: best_pass_rate minus baseline_pass_rate. + baseline_metric_breakdown: Mean score per metric for the baseline. + best_metric_breakdown: Mean score per metric for the best prompts. + baseline_prompts: Initial prompt text keyed by TargetPrompt name. + best_prompts: Best prompt text keyed by TargetPrompt name. + total_rounds: Number of rounds executed. + rounds: Per-round records in order. + total_reflection_lm_calls: Total reflection LM invocations (including retries). + total_judge_model_calls: Currently always 0; see field description below. + total_llm_cost: USD cost across the whole run (reflection LM + evaluator). + total_token_usage: Token usage across the whole run; keys are "prompt", "completion", "total". + duration_seconds: Wall-clock duration of the whole run in seconds. + started_at: ISO-8601 timestamp when the run started. + finished_at: ISO-8601 timestamp when the run finished. + extras: Free-form business payload; the optimizer never reads or modifies it. + """ + + schema_version: str = Field(default="v1", description="Result schema version.") + algorithm: str = Field(description=("Algorithm name that produced this result; matches the registered key in " + "OPTIMIZER_REGISTRY (e.g. 
'gepa_reflective')."), ) + + status: RunStatus = Field(description="Final run status.") + finish_reason: FinishReason = Field(description="Why the loop stopped.") + stop_reason: Optional[StopReason] = Field( + default=None, + description=("Which stop policy ended the run: 'required_metrics_passing' when " + "the framework's per-metric threshold policy fired; " + "'budget_exhausted' on MaxMetricCallsStopper; 'no_improvement' on " + "NoImprovementStopper; 'timeout' on TimeoutStopCondition; " + "'score_threshold' on ScoreThresholdStopper; " + "'max_candidate_proposals' / 'max_tracked_candidates' on the " + "respective candidate caps; 'completed' when the GEPA loop ended " + "without any registered stopper firing. None on FAILED runs that " + "errored before any stopper ran."), + ) + error_message: str = Field(default="", description="Error message when status is FAILED.") + + baseline_pass_rate: float = Field(description="Baseline validation pass rate.") + best_pass_rate: float = Field(description="Best validation pass rate.") + pass_rate_improvement: float = Field(description="best_pass_rate minus baseline_pass_rate.") + + baseline_metric_breakdown: dict[str, float] = Field( + default_factory=dict, + description="Mean score per metric for the baseline.", + ) + best_metric_breakdown: dict[str, float] = Field( + default_factory=dict, + description="Mean score per metric for the best prompts.", + ) + metric_thresholds: dict[str, float] = Field( + default_factory=dict, + description=("PASS/FAIL threshold per metric, copied from evaluate.metrics[].threshold. " + "Lets reporters and summary.txt show baseline / best scores alongside " + "the per-metric threshold so users can see at a glance whether a metric " + "is now above or below its acceptance bar."), + ) + + per_metric_best_candidates: dict[str, list[int]] = Field( + default_factory=dict, + description=("Per-metric Pareto-best candidate indices reported by GEPA. 
Keyed by " + "metric name; the list contains 0-based indices into the candidate " + "trajectory. Empty when the underlying algorithm does not expose " + "per-objective fronts. Useful for diagnosing which candidate excels " + "on which metric independent of the aggregated best."), + ) + + baseline_prompts: dict[str, str] = Field( + default_factory=dict, + description="Initial prompt text keyed by TargetPrompt name.", + ) + best_prompts: dict[str, str] = Field( + default_factory=dict, + description="Best prompt text keyed by TargetPrompt name.", + ) + + total_rounds: int = Field(description="Number of rounds executed.") + rounds: list[RoundRecord] = Field( + default_factory=list, + description="Per-round records in order.", + ) + + total_reflection_lm_calls: int = Field(description="Total reflection LM invocations (including retries).", ) + total_judge_model_calls: int = Field( + default=0, + description=("Currently always 0: the evaluator does not surface per-judge " + "invocation counts. 
Reflection LM cost is reflected in " + "total_reflection_lm_calls / total_llm_cost; for judge cost use " + "your LLM provider's billing dashboard."), + ) + total_llm_cost: float = Field( + default=0.0, + description="USD cost across the whole run.", + ) + total_token_usage: dict[str, int] = Field( + default_factory=lambda: { + "prompt": 0, + "completion": 0, + "total": 0 + }, + description='Token usage across the whole run; keys are "prompt", "completion", "total".', + ) + + duration_seconds: float = Field(description="Wall-clock duration of the run in seconds.") + started_at: str = Field(description="ISO-8601 timestamp when the run started.") + finished_at: str = Field(description="ISO-8601 timestamp when the run finished.") + + extras: dict[str, Any] = Field( + default_factory=dict, + description="Free-form business payload; optimizer ignores it.", + ) + + def dump_to(self, path: str) -> None: + """Serialize the result to a JSON file using model_dump_json(indent=2).""" + payload = self.model_dump_json(indent=2, by_alias=True) + with open(path, "w", encoding="utf-8") as fp: + fp.write(payload) + + @classmethod + def from_file(cls, path: str) -> "OptimizeResult": + """Load an OptimizeResult previously written by dump_to.""" + with open(path, "r", encoding="utf-8") as fp: + payload = fp.read() + return cls.model_validate_json(payload) + + def format_summary(self, *, output_dir: str, update_source: bool) -> str: + """Render the human-readable text summary persisted as ``summary.txt``. + + The layout mirrors the terminal summary so users can copy paste any + line directly. Algorithm name, status, baseline / best pass rates, + delta, rounds, duration, error message (when present), best prompt + inventory and the output directory are always included. 
+ """ + sign = "+" if self.pass_rate_improvement >= 0 else "" + if self.pass_rate_improvement > 0: + label = "improved" + elif self.pass_rate_improvement < 0: + label = "regressed" + else: + label = "no improvement" + accepted = sum(1 for r in self.rounds if r.accepted) + lines: list[str] = [ + f"Optimization complete | status={self.status} | algorithm={self.algorithm}", + "", + f"pass_rate : {self.baseline_pass_rate:.4f} -> {self.best_pass_rate:.4f}" + f" ({sign}{self.pass_rate_improvement:.4f}, {label})", + f"rounds : {accepted} accepted / {self.total_rounds} total", + f"duration : {self.duration_seconds:.2f}s", + f"started_at : {self.started_at}", + f"finished_at : {self.finished_at}", + ] + if self.status != "SUCCEEDED" and self.error_message: + lines.append(f"error_message : {self.error_message}") + if self.stop_reason is not None: + lines.append(f"stop_reason : {self.stop_reason}") + lines.append(f"update_source : {'true' if update_source else 'false'}") + lines.append(f"output_dir : {output_dir}") + if (self.baseline_metric_breakdown or self.best_metric_breakdown or self.metric_thresholds): + lines.append("") + lines.append("metric breakdown (threshold | baseline -> best):") + keys = sorted({ + *self.baseline_metric_breakdown.keys(), + *self.best_metric_breakdown.keys(), + *self.metric_thresholds.keys(), + }) + for name in keys: + b = self.baseline_metric_breakdown.get(name, float("nan")) + t = self.best_metric_breakdown.get(name, float("nan")) + if name in self.metric_thresholds: + threshold_str = f"{self.metric_thresholds[name]:.4f}" + else: + threshold_str = " - " + lines.append(f" - {name:<40s} threshold {threshold_str} " + f"{b:.4f} -> {t:.4f}") + if self.best_prompts: + lines.append("") + lines.append("best prompts:") + for name, content in self.best_prompts.items(): + rel = os.path.join("best_prompts", f"{name}.md") + lines.append(f" - {name:<40s} {len(content)} chars ({rel})") + lines.append("") + lines.append(f"artifacts directory: {output_dir}") + 
lines.append(" result.json summary.txt rounds/ run.log " + "baseline_prompts/ best_prompts/ config.snapshot.json") + return "\n".join(lines) + "\n" diff --git a/trpc_agent_sdk/evaluation/_remote_eval_service.py b/trpc_agent_sdk/evaluation/_remote_eval_service.py index 25a4319..edde246 100644 --- a/trpc_agent_sdk/evaluation/_remote_eval_service.py +++ b/trpc_agent_sdk/evaluation/_remote_eval_service.py @@ -48,8 +48,17 @@ from ._evaluator_registry import EvaluatorRegistry CallAgent = Callable[[str], Awaitable[str]] +# Metrics that cannot run under RemoteEvalService (black-box / call_agent +# mode) because they need information this service does not capture: +# - ``tool_trajectory_avg_score`` needs per-step tool call traces. +# - ``llm_rubric_knowledge_recall`` reads tool responses from +# ``Invocation.intermediate_data``; this service always emits +# ``intermediate_data=None`` (see ``_perform_inference_single_eval_item``), +# so the judge would silently see "No knowledge search results were +# found." for every case. REMOTE_EVAL_INCOMPATIBLE_METRICS: frozenset[str] = frozenset({ "tool_trajectory_avg_score", + "llm_rubric_knowledge_recall", }) EVAL_SESSION_ID_PREFIX = "___remote_eval___session___" diff --git a/trpc_agent_sdk/evaluation/_target_prompt.py b/trpc_agent_sdk/evaluation/_target_prompt.py new file mode 100644 index 0000000..da104f1 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_target_prompt.py @@ -0,0 +1,243 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Multi-field prompt registry with atomic write_all for AgentOptimizer.""" + +from __future__ import annotations + +import inspect +import os +from pathlib import Path +from typing import Awaitable +from typing import Callable +from typing import Optional + +AsyncRead = Callable[[], Awaitable[str]] +AsyncWrite = Callable[[str], Awaitable[None]] + + +class _RollbackError(RuntimeError): + """Aggregate error raised when one or more path-field rollbacks fail. + + Carries ``(field_name, error)`` pairs for every field whose rollback + raised. The original ``write_all`` failure is preserved as + ``__context__`` (via ``raise ... from primary_err``) so chained + tracebacks surface both the root cause and every rollback failure. + + Private (underscore-prefixed) — users only observe it through the + formatted message in tracebacks; never declared in the public API. + """ + + def __init__(self, failures: list[tuple[str, BaseException]]) -> None: + self.failures = failures + details = "; ".join(f"{name}: {type(err).__name__}: {err}" for name, err in failures) + super().__init__(f"TargetPrompt.write_all rollback failed for " + f"{len(failures)} field(s): {details}") + + +class _Source: + """Base for a single registered prompt source.""" + + +class _PathSource(_Source): + """File-backed prompt source: read/write a UTF-8 text file at a fixed path.""" + + __slots__ = ("path", ) + + def __init__(self, path: str) -> None: + self.path = path + + +class _CallbackSource(_Source): + """Callback-backed prompt source: caller-provided async read/write functions.""" + + __slots__ = ("read_fn", "write_fn") + + def __init__(self, read_fn: AsyncRead, write_fn: AsyncWrite) -> None: + self.read_fn = read_fn + self.write_fn = write_fn + + +class TargetPrompt: + """Registry of prompt fields to be optimized by AgentOptimizer. 
+ + Each field is registered with a unique name and one of two source forms: + - add_path(name, path): file-backed source; framework reads/writes the file + - add_callback(name, read=, write=): caller-backed source with async functions + + Typical use: + target = ( + TargetPrompt() + .add_path("system_prompt", "my_pkg/system.md") + .add_callback("retriever", read=load_fn, write=save_fn) + ) + + read_all / write_all operate on every registered field. write_all is atomic + for path-backed fields (tmp file + os.replace, rollback on partial failure); + callback-backed atomicity is the caller's responsibility. + """ + + def __init__(self) -> None: + self._sources: dict[str, _Source] = {} + + def add_path(self, name: str, path: str) -> "TargetPrompt": + """Register a file-backed prompt field. name must be unique.""" + self._reject_duplicate(name) + self._sources[name] = _PathSource(path) + return self + + def add_callback( + self, + name: str, + *, + read: AsyncRead, + write: AsyncWrite, + ) -> "TargetPrompt": + """Register a callback-backed prompt field with async read / write functions.""" + self._reject_duplicate(name) + if not inspect.iscoroutinefunction(read): + raise TypeError(f"add_callback {name!r}: read must be an async function") + if not inspect.iscoroutinefunction(write): + raise TypeError(f"add_callback {name!r}: write must be an async function") + self._sources[name] = _CallbackSource(read, write) + return self + + def names(self) -> list[str]: + """Return registered field names in insertion order.""" + return list(self._sources.keys()) + + def describe_source(self, name: str) -> str: + """Human-readable source label for a field. + + Path-backed fields return the file path verbatim; callback-backed + fields return the literal ``""``. Raises KeyError if name + is unknown. Used by the optimizer reporter header. 
+ """ + src = self._sources[name] + if isinstance(src, _PathSource): + return src.path + return "" + + async def read(self, name: str) -> str: + """Read the value of a single registered field. Raises KeyError if name is unknown.""" + src = self._sources[name] + return await self._read_one(src) + + async def read_all(self) -> dict[str, str]: + """Read every registered field. Propagates underlying errors (FileNotFoundError / callback exceptions).""" + out: dict[str, str] = {} + for name, src in self._sources.items(): + out[name] = await self._read_one(src) + return out + + async def write_all(self, prompts: dict[str, str]) -> None: + """Atomically write all registered fields. Keys must exactly match registered names. + + Atomicity contract: + - Path fields: write to {path}.tmp, then os.replace (single-file POSIX-atomic rename). + - On any path write failure: already-renamed paths are rolled back to pre-call content, + residual .tmp files are removed, and the original exception propagates. Rollback uses + the same tmp + os.replace primitive, so an interrupted rollback cannot leave a path + field half-written. + - If rollback of any field also fails, the original exception is preserved on + ``__context__`` and a single ``_RollbackError`` listing every per-field rollback + failure propagates. Rollback is best-effort: a failure on one field does not skip + the remaining fields. + - Callback fields: invoked sequentially after every path write succeeds. A callback + failure rolls back path fields to the pre-call snapshot before propagating; callback + fields themselves are not rolled back (caller-owned idempotency). 
+ """ + if set(prompts.keys()) != set(self._sources.keys()): + raise ValueError(f"TargetPrompt.write_all: prompts keys mismatch; " + f"expected {sorted(self._sources.keys())}, got {sorted(prompts.keys())}") + + path_backups = self._snapshot_path_contents() + written_paths: list[str] = [] + try: + for name, src in self._sources.items(): + if isinstance(src, _PathSource): + self._atomic_write_path(src.path, prompts[name]) + written_paths.append(name) + for name, src in self._sources.items(): + if isinstance(src, _CallbackSource): + await src.write_fn(prompts[name]) + except BaseException as primary_err: + rollback_failures = self._rollback_paths(written_paths, path_backups) + self._cleanup_tmp_files() + if rollback_failures: + raise _RollbackError(rollback_failures) from primary_err + raise + + def _reject_duplicate(self, name: str) -> None: + if name in self._sources: + raise ValueError(f"TargetPrompt: name {name!r} already registered") + + def _snapshot_path_contents(self) -> dict[str, Optional[str]]: + """Capture pre-call content of every path-backed field (None if source did not exist).""" + snapshot: dict[str, Optional[str]] = {} + for name, src in self._sources.items(): + if isinstance(src, _PathSource): + try: + snapshot[name] = Path(src.path).read_text(encoding="utf-8") + except FileNotFoundError: + snapshot[name] = None + return snapshot + + def _rollback_paths( + self, + written: list[str], + backups: dict[str, Optional[str]], + ) -> list[tuple[str, BaseException]]: + """Best-effort atomic rollback of every successfully written path field. + + For each field in ``written`` whose source did not exist before + write_all (``backups[name] is None``) the file is unlinked; for + fields that had pre-call content the content is restored via + ``_atomic_write_path`` (tmp + os.replace), so an interrupted + rollback cannot leave a path field half-written. 
+ + Failures are collected and returned rather than raised, so a + single field's failure does not skip subsequent fields. The + caller wraps the collected failures into ``_RollbackError``. + """ + failures: list[tuple[str, BaseException]] = [] + for name in written: + src = self._sources[name] + if not isinstance(src, _PathSource): + continue + backup = backups.get(name) + try: + if backup is None: + try: + os.unlink(src.path) + except FileNotFoundError: + pass + else: + self._atomic_write_path(src.path, backup) + except BaseException as err: + failures.append((name, err)) + return failures + + def _cleanup_tmp_files(self) -> None: + for src in self._sources.values(): + if isinstance(src, _PathSource): + tmp = src.path + ".tmp" + try: + os.unlink(tmp) + except FileNotFoundError: + pass + + @staticmethod + def _atomic_write_path(path: str, content: str) -> None: + tmp = path + ".tmp" + Path(tmp).write_text(content, encoding="utf-8") + os.replace(tmp, path) + + async def _read_one(self, src: _Source) -> str: + if isinstance(src, _PathSource): + return Path(src.path).read_text(encoding="utf-8") + if isinstance(src, _CallbackSource): + return await src.read_fn() + raise TypeError(f"unknown source type: {type(src).__name__}") From 344d3fd53c957612a15976b32ac54ac1444b4122 Mon Sep 17 00:00:00 2001 From: Rook1ex <1127097451@qq.com> Date: Thu, 21 May 2026 17:23:05 +0800 Subject: [PATCH 2/7] =?UTF-8?q?feat:=20=E8=A1=A5=E5=85=85=E4=B9=8B?= =?UTF-8?q?=E5=89=8D=E5=9B=A0=E6=9C=AA=E5=BC=80=E6=BA=90=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E7=9A=84=20evaluation=20=E4=BE=8B=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/evaluation/callbacks/.env | 1 + examples/evaluation/callbacks/README.md | 22 +++ .../evaluation/callbacks/agent/__init__.py | 6 + examples/evaluation/callbacks/agent/agent.py | 36 ++++ .../agent/callbacks_example.evalset.json | 37 ++++ examples/evaluation/callbacks/agent/config.py | 21 +++ 
.../callbacks/agent/test_config.json | 27 +++ .../evaluation/callbacks/test_callbacks.py | 116 +++++++++++++ examples/evaluation/context_messages/.env | 1 + .../evaluation/context_messages/README.md | 20 +++ .../context_messages/agent/__init__.py | 6 + .../context_messages/agent/agent.py | 36 ++++ .../context_messages/agent/config.py | 21 +++ .../agent/context_example.evalset.json | 47 ++++++ .../context_messages/agent/test_config.json | 27 +++ .../context_messages/test_context_messages.py | 24 +++ examples/evaluation/custom_runner/.env | 1 + examples/evaluation/custom_runner/README.md | 39 +++++ .../custom_runner/agent/__init__.py | 6 + .../evaluation/custom_runner/agent/agent.py | 36 ++++ .../evaluation/custom_runner/agent/config.py | 21 +++ .../agent/custom_runner_example.evalset.json | 37 ++++ .../custom_runner/agent/test_config.json | 27 +++ .../custom_runner/test_custom_runner.py | 41 +++++ examples/evaluation/llm_final_response/.env | 1 + .../evaluation/llm_final_response/README.md | 23 +++ .../llm_final_response/agent/__init__.py | 6 + .../llm_final_response/agent/agent.py | 47 ++++++ .../llm_final_response/agent/config.py | 22 +++ .../agent/llm_final_response.evalset.json | 31 ++++ .../llm_final_response/agent/test_config.json | 19 +++ .../test_llm_final_response.py | 24 +++ examples/evaluation/llm_judge_tools/.env | 1 + examples/evaluation/llm_judge_tools/README.md | 63 +++++++ .../llm_judge_tools/agent/__init__.py | 6 + .../evaluation/llm_judge_tools/agent/agent.py | 47 ++++++ .../llm_judge_tools/agent/config.py | 22 +++ .../agent/judge_tools.evalset.json | 31 ++++ .../llm_judge_tools/agent/test_config.json | 53 ++++++ .../llm_judge_tools/test_llm_judge_tools.py | 40 +++++ .../llm_rubric_knowledge_recall/.env | 1 + .../llm_rubric_knowledge_recall/README.md | 23 +++ .../agent/__init__.py | 6 + .../agent/agent.py | 48 ++++++ .../agent/config.py | 22 +++ .../llm_rubric_knowledge_recall.evalset.json | 33 ++++ .../agent/test_config.json | 32 ++++ 
.../test_llm_rubric_knowledge_recall.py | 24 +++ examples/evaluation/llm_rubric_response/.env | 1 + .../evaluation/llm_rubric_response/README.md | 23 +++ .../llm_rubric_response/agent/__init__.py | 6 + .../llm_rubric_response/agent/agent.py | 47 ++++++ .../llm_rubric_response/agent/config.py | 22 +++ .../agent/llm_rubric_response.evalset.json | 31 ++++ .../agent/test_config.json | 31 ++++ .../test_llm_rubric_response.py | 24 +++ examples/evaluation/pass_at_k/.env | 1 + examples/evaluation/pass_at_k/README.md | 22 +++ .../evaluation/pass_at_k/agent/__init__.py | 6 + examples/evaluation/pass_at_k/agent/agent.py | 45 +++++ examples/evaluation/pass_at_k/agent/config.py | 22 +++ .../pass_at_k/agent/test_config.json | 28 +++ .../agent/weather_agent.evalset.json | 37 ++++ .../evaluation/pass_at_k/test_pass_at_k.py | 38 +++++ examples/evaluation/quickstart/.env | 1 + examples/evaluation/quickstart/README.md | 27 +++ .../evaluation/quickstart/agent/__init__.py | 6 + examples/evaluation/quickstart/agent/agent.py | 78 +++++++++ .../evaluation/quickstart/agent/config.py | 22 +++ .../quickstart/agent/test_config.json | 27 +++ .../agent/weather_agent.evalset.json | 47 ++++++ .../evaluation/quickstart/test_quickstart.py | 22 +++ examples/evaluation/trace_mode/.env | 1 + examples/evaluation/trace_mode/README.md | 20 +++ .../evaluation/trace_mode/agent/__init__.py | 6 + examples/evaluation/trace_mode/agent/agent.py | 31 ++++ .../evaluation/trace_mode/agent/config.py | 21 +++ .../trace_mode/agent/test_config.json | 27 +++ .../agent/trace_example.evalset.json | 55 ++++++ .../evaluation/trace_mode/test_trace_mode.py | 22 +++ examples/evaluation/webui/.env | 1 + examples/evaluation/webui/README.md | 36 ++++ examples/evaluation/webui/agent/__init__.py | 6 + .../evaluation/webui/agent/agent.evalset.json | 101 +++++++++++ examples/evaluation/webui/agent/agent.py | 42 +++++ examples/evaluation/webui/agent/config.py | 24 +++ examples/evaluation/webui/agent/prompts.py | 40 +++++ 
.../evaluation/webui/agent/test_config.json | 6 + examples/evaluation/webui/agent/tools.py | 159 ++++++++++++++++++ examples/evaluation/webui/test_book_finder.py | 21 +++ 90 files changed, 2514 insertions(+) create mode 100644 examples/evaluation/callbacks/.env create mode 100644 examples/evaluation/callbacks/README.md create mode 100644 examples/evaluation/callbacks/agent/__init__.py create mode 100644 examples/evaluation/callbacks/agent/agent.py create mode 100644 examples/evaluation/callbacks/agent/callbacks_example.evalset.json create mode 100644 examples/evaluation/callbacks/agent/config.py create mode 100644 examples/evaluation/callbacks/agent/test_config.json create mode 100644 examples/evaluation/callbacks/test_callbacks.py create mode 100644 examples/evaluation/context_messages/.env create mode 100644 examples/evaluation/context_messages/README.md create mode 100644 examples/evaluation/context_messages/agent/__init__.py create mode 100644 examples/evaluation/context_messages/agent/agent.py create mode 100644 examples/evaluation/context_messages/agent/config.py create mode 100644 examples/evaluation/context_messages/agent/context_example.evalset.json create mode 100644 examples/evaluation/context_messages/agent/test_config.json create mode 100644 examples/evaluation/context_messages/test_context_messages.py create mode 100644 examples/evaluation/custom_runner/.env create mode 100644 examples/evaluation/custom_runner/README.md create mode 100644 examples/evaluation/custom_runner/agent/__init__.py create mode 100644 examples/evaluation/custom_runner/agent/agent.py create mode 100644 examples/evaluation/custom_runner/agent/config.py create mode 100644 examples/evaluation/custom_runner/agent/custom_runner_example.evalset.json create mode 100644 examples/evaluation/custom_runner/agent/test_config.json create mode 100644 examples/evaluation/custom_runner/test_custom_runner.py create mode 100644 examples/evaluation/llm_final_response/.env create mode 100644 
examples/evaluation/llm_final_response/README.md create mode 100644 examples/evaluation/llm_final_response/agent/__init__.py create mode 100644 examples/evaluation/llm_final_response/agent/agent.py create mode 100644 examples/evaluation/llm_final_response/agent/config.py create mode 100644 examples/evaluation/llm_final_response/agent/llm_final_response.evalset.json create mode 100644 examples/evaluation/llm_final_response/agent/test_config.json create mode 100644 examples/evaluation/llm_final_response/test_llm_final_response.py create mode 100644 examples/evaluation/llm_judge_tools/.env create mode 100644 examples/evaluation/llm_judge_tools/README.md create mode 100644 examples/evaluation/llm_judge_tools/agent/__init__.py create mode 100644 examples/evaluation/llm_judge_tools/agent/agent.py create mode 100644 examples/evaluation/llm_judge_tools/agent/config.py create mode 100644 examples/evaluation/llm_judge_tools/agent/judge_tools.evalset.json create mode 100644 examples/evaluation/llm_judge_tools/agent/test_config.json create mode 100644 examples/evaluation/llm_judge_tools/test_llm_judge_tools.py create mode 100644 examples/evaluation/llm_rubric_knowledge_recall/.env create mode 100644 examples/evaluation/llm_rubric_knowledge_recall/README.md create mode 100644 examples/evaluation/llm_rubric_knowledge_recall/agent/__init__.py create mode 100644 examples/evaluation/llm_rubric_knowledge_recall/agent/agent.py create mode 100644 examples/evaluation/llm_rubric_knowledge_recall/agent/config.py create mode 100644 examples/evaluation/llm_rubric_knowledge_recall/agent/llm_rubric_knowledge_recall.evalset.json create mode 100644 examples/evaluation/llm_rubric_knowledge_recall/agent/test_config.json create mode 100644 examples/evaluation/llm_rubric_knowledge_recall/test_llm_rubric_knowledge_recall.py create mode 100644 examples/evaluation/llm_rubric_response/.env create mode 100644 examples/evaluation/llm_rubric_response/README.md create mode 100644 
examples/evaluation/llm_rubric_response/agent/__init__.py create mode 100644 examples/evaluation/llm_rubric_response/agent/agent.py create mode 100644 examples/evaluation/llm_rubric_response/agent/config.py create mode 100644 examples/evaluation/llm_rubric_response/agent/llm_rubric_response.evalset.json create mode 100644 examples/evaluation/llm_rubric_response/agent/test_config.json create mode 100644 examples/evaluation/llm_rubric_response/test_llm_rubric_response.py create mode 100644 examples/evaluation/pass_at_k/.env create mode 100644 examples/evaluation/pass_at_k/README.md create mode 100644 examples/evaluation/pass_at_k/agent/__init__.py create mode 100644 examples/evaluation/pass_at_k/agent/agent.py create mode 100644 examples/evaluation/pass_at_k/agent/config.py create mode 100644 examples/evaluation/pass_at_k/agent/test_config.json create mode 100644 examples/evaluation/pass_at_k/agent/weather_agent.evalset.json create mode 100644 examples/evaluation/pass_at_k/test_pass_at_k.py create mode 100644 examples/evaluation/quickstart/.env create mode 100644 examples/evaluation/quickstart/README.md create mode 100644 examples/evaluation/quickstart/agent/__init__.py create mode 100644 examples/evaluation/quickstart/agent/agent.py create mode 100644 examples/evaluation/quickstart/agent/config.py create mode 100644 examples/evaluation/quickstart/agent/test_config.json create mode 100644 examples/evaluation/quickstart/agent/weather_agent.evalset.json create mode 100644 examples/evaluation/quickstart/test_quickstart.py create mode 100644 examples/evaluation/trace_mode/.env create mode 100644 examples/evaluation/trace_mode/README.md create mode 100644 examples/evaluation/trace_mode/agent/__init__.py create mode 100644 examples/evaluation/trace_mode/agent/agent.py create mode 100644 examples/evaluation/trace_mode/agent/config.py create mode 100644 examples/evaluation/trace_mode/agent/test_config.json create mode 100644 
examples/evaluation/trace_mode/agent/trace_example.evalset.json create mode 100644 examples/evaluation/trace_mode/test_trace_mode.py create mode 100644 examples/evaluation/webui/.env create mode 100644 examples/evaluation/webui/README.md create mode 100644 examples/evaluation/webui/agent/__init__.py create mode 100644 examples/evaluation/webui/agent/agent.evalset.json create mode 100644 examples/evaluation/webui/agent/agent.py create mode 100644 examples/evaluation/webui/agent/config.py create mode 100644 examples/evaluation/webui/agent/prompts.py create mode 100644 examples/evaluation/webui/agent/test_config.json create mode 100644 examples/evaluation/webui/agent/tools.py create mode 100644 examples/evaluation/webui/test_book_finder.py diff --git a/examples/evaluation/callbacks/.env b/examples/evaluation/callbacks/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/callbacks/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/callbacks/README.md b/examples/evaluation/callbacks/README.md new file mode 100644 index 0000000..91d725c --- /dev/null +++ b/examples/evaluation/callbacks/README.md @@ -0,0 +1,22 @@ +# å›žč°ƒ Callbacks 示例 + +åœØčÆ„ęµ‹äø­ę³Øå†Œ **Callbacks**ļ¼šåœØęŽØē†é›†/ē”Øä¾‹ęŽØē†ć€ę‰“åˆ†é›†/ē”Øä¾‹ę‰“åˆ†ēš„ 8 äøŖē”Ÿå‘½å‘ØęœŸčŠ‚ē‚¹ęŒ‚č½½é’©å­ļ¼Œē”ØäŗŽę‰“ē‚¹ć€ę—„åæ—ć€é‡‡ę ·ęˆ–äøŠęŠ„ć€‚ + +## ē›®å½•ē»“ęž„ + +- `callbacks/`ļ¼šē¤ŗä¾‹ę ¹ē›®å½• +- `agent/`ļ¼šå†…å« `agent.py`态`callbacks_example.evalset.json`态`test_config.json`态`config.py` +- `test_callbacks.py`ļ¼šč°ƒē”Ø `AgentEvaluator.evaluate(..., callbacks=callbacks)`ļ¼Œę³Øå†Œ `before_inference_set`态`after_inference_case`态`before_evaluate_set`态`after_evaluate_case` 并打旄志 + +## ēŽÆå¢ƒč¦ę±‚ + +Python 3.10+ć€‚éœ€é…ē½® `TRPC_AGENT_API_KEY` ē­‰ēŽÆå¢ƒå˜é‡ļ¼ˆåŒ quickstart)。 + +## 运蔌 + +```bash +cd examples/evaluation/callbacks +pytest test_callbacks.py -v --tb=short -s +``` + +`-s` åÆēœ‹åˆ°å›žč°ƒäø­ēš„ 
print 输出。 diff --git a/examples/evaluation/callbacks/agent/__init__.py b/examples/evaluation/callbacks/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/callbacks/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/callbacks/agent/agent.py b/examples/evaluation/callbacks/agent/agent.py new file mode 100644 index 0000000..5bff805 --- /dev/null +++ b/examples/evaluation/callbacks/agent/agent.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent for callbacks example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def get_weather(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚å½“å‰å¤©ę°”ć€‚""" + data = { + "上海": {"temperature": 18, "condition": "å¤šäŗ‘"}, + "åŒ—äŗ¬": {"temperature": 15, "condition": "ꙓ"}, + } + result = data.get(city, {"temperature": 20, "condition": "未矄"}) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + api_key, url, model_name = get_model_config() + return LlmAgent( + name="weather_agent", + description="å¤©ę°”ęŸ„čÆ¢åŠ©ę‰‹", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=url), + instruction="ä½ ę˜Æå¤©ę°”åŠ©ę‰‹ļ¼Œē”Ø get_weather ęŸ„čÆ¢åŸŽåø‚å¤©ę°”å¹¶ē®€č¦å›žē­”ć€‚", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/callbacks/agent/callbacks_example.evalset.json b/examples/evaluation/callbacks/agent/callbacks_example.evalset.json new file mode 100644 index 0000000..47515f6 --- /dev/null +++ b/examples/evaluation/callbacks/agent/callbacks_example.evalset.json @@ -0,0 +1,37 @@ +{ + "eval_set_id": "callbacks_example", + "name": "å›žč°ƒē¤ŗä¾‹", + 
"description": "ę¼”ē¤ŗåœØčÆ„ęµ‹äø­ę³Øå†Œ Callbacksļ¼ŒåœØęŽØē†/ę‰“åˆ†ē”Ÿå‘½å‘ØęœŸę‰“ę—„åæ—ęˆ–č®”ę•°", + "eval_cases": [ + { + "eval_id": "callbacks_001", + "conversation": [ + { + "invocation_id": "e-1", + "user_content": { + "parts": [{"text": "äøŠęµ·å¤©ę°”ę€Žä¹ˆę ·"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "18"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/callbacks/agent/config.py b/examples/evaluation/callbacks/agent/config.py new file mode 100644 index 0000000..428fb8d --- /dev/null +++ b/examples/evaluation/callbacks/agent/config.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/callbacks/agent/test_config.json b/examples/evaluation/callbacks/agent/test_config.json new file mode 100644 index 0000000..c6824ba --- /dev/null +++ b/examples/evaluation/callbacks/agent/test_config.json @@ -0,0 +1,27 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + 
"threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ] +} diff --git a/examples/evaluation/callbacks/test_callbacks.py b/examples/evaluation/callbacks/test_callbacks.py new file mode 100644 index 0000000..8a026e8 --- /dev/null +++ b/examples/evaluation/callbacks/test_callbacks.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""å›žč°ƒē¤ŗä¾‹ļ¼šåœØčÆ„ęµ‹äø­ę³Øå†Œ 8 äøŖē”Ÿå‘½å‘ØęœŸ Callbacksļ¼Œę‰“ę—„åæ—å¹¶ę¼”ē¤ŗ context 传递。""" + +import os +import pytest +from trpc_agent_sdk.evaluation import ( + AgentEvaluator, + Callbacks, + Callback, + CallbackResult, + BeforeInferenceSetArgs, + AfterInferenceSetArgs, + BeforeInferenceCaseArgs, + AfterInferenceCaseArgs, + BeforeEvaluateSetArgs, + AfterEvaluateSetArgs, + BeforeEvaluateCaseArgs, + AfterEvaluateCaseArgs, +) + +triggered: list[str] = [] + + +def before_inference_set(ctx, args: BeforeInferenceSetArgs): + triggered.append("before_inference_set") + print("[callback] ęŽØē†é›†å¼€å§‹", args.request.eval_set_id, flush=True) + return None + + +def after_inference_set(ctx, args: AfterInferenceSetArgs): + triggered.append("after_inference_set") + n = len(args.results) if args.results else 0 + print("[callback] ęŽØē†é›†ē»“ęŸļ¼Œå…±", n, "个用例", flush=True) + return None + + +def before_inference_case(ctx, args: BeforeInferenceCaseArgs): + triggered.append("before_inference_case") + print("[callback] ē”Øä¾‹ęŽØē†å¼€å§‹", args.eval_case_id, flush=True) + return None + + +def after_inference_case(ctx, args: AfterInferenceCaseArgs): + triggered.append("after_inference_case") + print("[callback] ē”Øä¾‹ęŽØē†ē»“ęŸ", args.result.eval_case_id, flush=True) + return None + + +def before_evaluate_set(ctx, args: BeforeEvaluateSetArgs): + triggered.append("before_evaluate_set") + n = len(args.request.inference_results) + print("[callback] ę‰“åˆ†é›†å¼€å§‹ cases=", n, flush=True) + return CallbackResult(context={"phase": "evaluate"}) + 
+ +def after_evaluate_set(ctx, args: AfterEvaluateSetArgs): + triggered.append("after_evaluate_set") + n = len(args.result.eval_case_results) if args.result else 0 + phase = (ctx.get("context") or {}).get("phase", "?") + print("[callback] ę‰“åˆ†é›†ē»“ęŸļ¼Œå…±", n, "äøŖē”Øä¾‹ļ¼Œctx.phase=", phase, flush=True) + return None + + +def before_evaluate_case(ctx, args: BeforeEvaluateCaseArgs): + triggered.append("before_evaluate_case") + print("[callback] ē”Øä¾‹ę‰“åˆ†å¼€å§‹", args.eval_case_id, flush=True) + return None + + +def after_evaluate_case(ctx, args: AfterEvaluateCaseArgs): + triggered.append("after_evaluate_case") + print("[callback] ē”Øä¾‹ę‰“åˆ†ē»“ęŸ", args.result.eval_id, flush=True) + return None + + +@pytest.mark.asyncio +async def test_with_callbacks(): + triggered.clear() + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "callbacks_example.evalset.json") + + callbacks = Callbacks() + callbacks.register( + "demo", + Callback( + before_inference_set=before_inference_set, + after_inference_set=after_inference_set, + before_inference_case=before_inference_case, + after_inference_case=after_inference_case, + before_evaluate_set=before_evaluate_set, + after_evaluate_set=after_evaluate_set, + before_evaluate_case=before_evaluate_case, + after_evaluate_case=after_evaluate_case, + ), + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + callbacks=callbacks, + ) + + expected = [ + "before_inference_set", + "before_inference_case", + "after_inference_case", + "after_inference_set", + "before_evaluate_set", + "before_evaluate_case", + "after_evaluate_case", + "after_evaluate_set", + ] + assert triggered == expected, triggered diff --git a/examples/evaluation/context_messages/.env b/examples/evaluation/context_messages/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ 
b/examples/evaluation/context_messages/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/context_messages/README.md b/examples/evaluation/context_messages/README.md new file mode 100644 index 0000000..76bd75c --- /dev/null +++ b/examples/evaluation/context_messages/README.md @@ -0,0 +1,20 @@ +# äøŠäø‹ę–‡ę³Øå…„ē¤ŗä¾‹ + +åœØčÆ„ęµ‹ē”Øä¾‹äøŠé…ē½® **context_messages**ļ¼ščÆ„ä¼°ęœåŠ”åœØęÆč½®ęŽØē†å‰ä¼šå°†å…¶äø­å†…å®¹ę³Øå…„ä¼ščÆäøŠäø‹ę–‡ļ¼Œē”ØäŗŽä¼ é€’ē³»ē»ŸčÆ“ę˜Žć€é¢†åŸŸēŸ„čÆ†ęˆ–ę ¼å¼ēŗ¦ęŸć€‚ + +## ē›®å½•ē»“ęž„ + +- `context_messages/`ļ¼šē¤ŗä¾‹ę ¹ē›®å½• +- `agent/`ļ¼šå†…å« `agent.py`态`context_example.evalset.json`(含带 context_messages ēš„ē”Øä¾‹ļ¼‰ć€`test_config.json`态`config.py` +- `test_context_messages.py`ļ¼šč°ƒē”Ø `AgentEvaluator.evaluate` 跑评测 + +## ēŽÆå¢ƒč¦ę±‚ + +Python 3.10+ć€‚éœ€é…ē½® `TRPC_AGENT_API_KEY` ē­‰ēŽÆå¢ƒå˜é‡ļ¼ˆåŒ quickstart)。 + +## 运蔌 + +```bash +cd examples/evaluation/context_messages +pytest test_context_messages.py -v --tb=short -s +``` diff --git a/examples/evaluation/context_messages/agent/__init__.py b/examples/evaluation/context_messages/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/context_messages/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/context_messages/agent/agent.py b/examples/evaluation/context_messages/agent/agent.py new file mode 100644 index 0000000..1539966 --- /dev/null +++ b/examples/evaluation/context_messages/agent/agent.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent for context_messages example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import 
get_model_config + + +def get_weather(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚å½“å‰å¤©ę°”ć€‚""" + data = { + "上海": {"temperature": 18, "condition": "å¤šäŗ‘"}, + "åŒ—äŗ¬": {"temperature": 15, "condition": "ꙓ"}, + } + result = data.get(city, {"temperature": 20, "condition": "未矄"}) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + api_key, url, model_name = get_model_config() + return LlmAgent( + name="weather_agent", + description="å¤©ę°”ęŸ„čÆ¢åŠ©ę‰‹", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=url), + instruction="ä½ ę˜Æå¤©ę°”åŠ©ę‰‹ļ¼Œē”Ø get_weather ęŸ„čÆ¢åŸŽåø‚å¤©ę°”å¹¶ē®€č¦å›žē­”ć€‚", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/context_messages/agent/config.py b/examples/evaluation/context_messages/agent/config.py new file mode 100644 index 0000000..428fb8d --- /dev/null +++ b/examples/evaluation/context_messages/agent/config.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/context_messages/agent/context_example.evalset.json b/examples/evaluation/context_messages/agent/context_example.evalset.json new file mode 100644 index 0000000..612200b --- /dev/null +++ b/examples/evaluation/context_messages/agent/context_example.evalset.json @@ -0,0 +1,47 @@ +{ + "eval_set_id": "context_messages_example", + "name": "äøŠäø‹ę–‡ę³Øå…„ē¤ŗä¾‹", + "description": "ē”Øä¾‹äø­é…ē½® 
context_messagesļ¼ŒåœØęÆč½®ęŽØē†å‰ę³Øå…„åˆ°ä¼ščÆ", + "eval_cases": [ + { + "eval_id": "with_context_001", + "context_messages": [ + { + "parts": [ + { + "text": "ä½ ę˜Æäø€äøŖå¤©ę°”åŠ©ę‰‹ļ¼ŒåŖå›žē­”äøŽå¤©ę°”ē›øå…³ēš„é—®é¢˜ļ¼Œå›žē­”č¦ē®€ēŸ­ć€‚" + } + ], + "role": "user" + } + ], + "conversation": [ + { + "invocation_id": "e-1", + "user_content": { + "parts": [{"text": "äøŠęµ·å¤©ę°”ę€Žä¹ˆę ·"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "18"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/context_messages/agent/test_config.json b/examples/evaluation/context_messages/agent/test_config.json new file mode 100644 index 0000000..c6824ba --- /dev/null +++ b/examples/evaluation/context_messages/agent/test_config.json @@ -0,0 +1,27 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ] +} diff --git a/examples/evaluation/context_messages/test_context_messages.py b/examples/evaluation/context_messages/test_context_messages.py new file mode 100644 index 0000000..ee32161 --- /dev/null +++ b/examples/evaluation/context_messages/test_context_messages.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""äøŠäø‹ę–‡ę³Øå…„ē¤ŗä¾‹ļ¼šåœØē”Øä¾‹äø­é…ē½® context_messagesļ¼ŒęÆč½®ęŽØē†å‰ę³Øå…„åˆ°ä¼ščÆć€‚""" + +import os +import pytest +from 
trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_context_messages(): + """用例带 context_messagesļ¼ŒęŽØē†å‰ä¼šę³Øå…„åˆ°ä¼ščÆäøŠäø‹ę–‡ć€‚""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "context_example.evalset.json" + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/custom_runner/.env b/examples/evaluation/custom_runner/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/custom_runner/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/custom_runner/README.md b/examples/evaluation/custom_runner/README.md new file mode 100644 index 0000000..64bede7 --- /dev/null +++ b/examples/evaluation/custom_runner/README.md @@ -0,0 +1,39 @@ +# č‡Ŗå®šä¹‰ Runner 示例 + +ęœ¬ē¤ŗä¾‹ę¼”ē¤ŗå¦‚ä½•åœØčÆ„ęµ‹äø­**ä¼ å…„č‡Ŗå®šä¹‰ Runner**ļ¼šęŽØē†ē”±ä½ ęä¾›ēš„ Runner(含 Agent态SessionService ē­‰ļ¼‰ę‰§č”Œļ¼Œę‰“åˆ†é€»č¾‘ä»ē”±čÆ„ęµ‹ę”†ęž¶å®Œęˆć€‚ + +## é€‚ē”Øåœŗę™Æ + +- å¤ē”Øå·²ęœ‰ä¼ščÆęœåŠ”ļ¼ˆå¦‚ Redis态SQL) +- äøŽēŗæäøŠ/ęœ¬åœ°éƒØē½²ä½æē”ØåŒäø€ Runner å½¢ę€ļ¼Œē»Ÿäø€é‰“ęƒć€ę—„åæ—ē­‰ +- åøŒęœ›čÆ„ęµ‹äøŽēœŸå®žčæč”ŒēŽÆå¢ƒäø€č‡“ + +## ē›®å½•ē»“ęž„ + +``` +custom_runner/ +ā”œā”€ā”€ agent/ +│ ā”œā”€ā”€ __init__.py +│ ā”œā”€ā”€ agent.py # 天气 Agent(root_agent) +│ ā”œā”€ā”€ config.py # ęØ”åž‹é…ē½® +│ ā”œā”€ā”€ test_config.json # čÆ„ęµ‹ęŒ‡ę ‡é…ē½® +│ └── custom_runner_example.evalset.json # 评测集 +ā”œā”€ā”€ test_custom_runner.py # Ꞅ造 Runner å¹¶č°ƒē”Ø evaluate(..., runner=runner) +└── README.md +``` + +## čæč”Œę–¹å¼ + +在 **ęœ¬ē›®å½•** äø‹ę‰§č”Œļ¼ˆéœ€å·²é…ē½® `TRPC_AGENT_API_KEY` ē­‰ēŽÆå¢ƒå˜é‡ļ¼‰ļ¼š + +```bash +pytest test_custom_runner.py -v -s +``` + +## 要点 + +1. 
**Ꞅ造 Runner**ļ¼šä½æē”Ø `Runner(app_name=..., agent=..., session_service=...)`ļ¼Œęœ¬ē¤ŗä¾‹ä½æē”Ø `InMemorySessionService()`ļ¼ŒåÆęŒ‰éœ€ę›æę¢äøŗå…¶ä»– SessionService怂 +2. **传兄评测**:在 `AgentEvaluator.evaluate(..., runner=runner)` ꈖ `get_executer(..., runner=runner)` äø­ä¼ å…„ä½ ēš„ Runner怂 +3. **session_input**ļ¼šč‹„čÆ„ęµ‹ē”Øä¾‹äø­é…ē½®äŗ† `session_input`ļ¼Œę”†ęž¶ä¼šęŒ‰éœ€åœØčÆ„ Runner ēš„ä¼ščÆäø­åˆ›å»ŗ/ę›“ę–°ä¼ščÆć€‚ + +ę›“å¤ščÆ“ę˜Žč§ę–‡ę”£ [run_eval_pytest.md](../../../docs/evaluation/run_eval_pytest.md) äø­ēš„ć€Œč‡Ŗå®šä¹‰ Runnerć€å°čŠ‚ć€‚ diff --git a/examples/evaluation/custom_runner/agent/__init__.py b/examples/evaluation/custom_runner/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/custom_runner/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/custom_runner/agent/agent.py b/examples/evaluation/custom_runner/agent/agent.py new file mode 100644 index 0000000..3be5b7e --- /dev/null +++ b/examples/evaluation/custom_runner/agent/agent.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent for custom runner example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def get_weather(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚å½“å‰å¤©ę°”ć€‚""" + data = { + "上海": {"temperature": 18, "condition": "å¤šäŗ‘"}, + "åŒ—äŗ¬": {"temperature": 15, "condition": "ꙓ"}, + } + result = data.get(city, {"temperature": 20, "condition": "未矄"}) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + api_key, url, model_name = get_model_config() + return LlmAgent( + name="weather_agent", + description="å¤©ę°”ęŸ„čÆ¢åŠ©ę‰‹", + 
model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=url), + instruction="ä½ ę˜Æå¤©ę°”åŠ©ę‰‹ļ¼Œē”Ø get_weather ęŸ„čÆ¢åŸŽåø‚å¤©ę°”å¹¶ē®€č¦å›žē­”ć€‚", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/custom_runner/agent/config.py b/examples/evaluation/custom_runner/agent/config.py new file mode 100644 index 0000000..428fb8d --- /dev/null +++ b/examples/evaluation/custom_runner/agent/config.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/custom_runner/agent/custom_runner_example.evalset.json b/examples/evaluation/custom_runner/agent/custom_runner_example.evalset.json new file mode 100644 index 0000000..8a1d80e --- /dev/null +++ b/examples/evaluation/custom_runner/agent/custom_runner_example.evalset.json @@ -0,0 +1,37 @@ +{ + "eval_set_id": "custom_runner_example", + "name": "č‡Ŗå®šä¹‰ Runner 示例", + "description": "使用自建 Runner(含 agent äøŽ session_serviceļ¼‰č·‘čÆ„ęµ‹ļ¼Œę‰“åˆ†ä»ē”±ę”†ęž¶å®Œęˆ", + "eval_cases": [ + { + "eval_id": "custom_runner_001", + "conversation": [ + { + "invocation_id": "e-1", + "user_content": { + "parts": [{"text": "äøŠęµ·å¤©ę°”ę€Žä¹ˆę ·"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "18"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": 
"user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/custom_runner/agent/test_config.json b/examples/evaluation/custom_runner/agent/test_config.json new file mode 100644 index 0000000..c6824ba --- /dev/null +++ b/examples/evaluation/custom_runner/agent/test_config.json @@ -0,0 +1,27 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ] +} diff --git a/examples/evaluation/custom_runner/test_custom_runner.py b/examples/evaluation/custom_runner/test_custom_runner.py new file mode 100644 index 0000000..e243d11 --- /dev/null +++ b/examples/evaluation/custom_runner/test_custom_runner.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""č‡Ŗå®šä¹‰ Runner ē¤ŗä¾‹ļ¼šä½æē”Øč‡Ŗå»ŗ Runner(agent + session_service)跑评测。""" + +import os +import pytest + +from trpc_agent_sdk.evaluation import AgentEvaluator +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService + +# ä»Žå½“å‰ē¤ŗä¾‹ēš„ agent åŒ…åŠ č½½ root_agent +from agent import root_agent + + +@pytest.mark.asyncio +async def test_evaluate_with_custom_runner(): + """ä½æē”Øč‡Ŗå®šä¹‰ Runner ę‰§č”ŒčÆ„ęµ‹ļ¼šč‡Ŗå»ŗ Runner č“Ÿč“£ęŽØē†ļ¼Œę‰“åˆ†ē”±ę”†ęž¶å®Œęˆć€‚""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "custom_runner_example.evalset.json" + ) + + # č‡Ŗå»ŗä¼ščÆęœåŠ”ļ¼ˆåÆę›æę¢äøŗ Redis/SQL 等) + session_service = InMemorySessionService() + + # Ꞅ造 Runnerļ¼šäøŽēŗæäøŠ/ęœ¬åœ°éƒØē½²ä½æē”ØåŒäø€ Runner å½¢ę€ļ¼Œä¾æäŗŽå¤ē”ØēŽÆå¢ƒ + runner = Runner( + 
app_name="weather_agent", + agent=root_agent, + session_service=session_service, + ) + + # ä¼ å…„ runner åŽļ¼ŒęŽØē†ē”±čÆ„ Runner ę‰§č”Œļ¼Œę‰“åˆ†ä»ē”±čÆ„ęµ‹ę”†ęž¶å®Œęˆ + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + runner=runner, + ) diff --git a/examples/evaluation/llm_final_response/.env b/examples/evaluation/llm_final_response/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/llm_final_response/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/llm_final_response/README.md b/examples/evaluation/llm_final_response/README.md new file mode 100644 index 0000000..34e6cd9 --- /dev/null +++ b/examples/evaluation/llm_final_response/README.md @@ -0,0 +1,23 @@ +# llm_final_response 评估器示例 + +使用 **llm_final_response** ęŒ‡ę ‡ļ¼šē”±č£åˆ¤ęØ”åž‹åÆ¹ęÆ” Agent å®žé™…ęœ€ē»ˆå›žē­”äøŽčÆ„ęµ‹é›†äø­ēš„å‚č€ƒē­”ę”ˆļ¼Œē»™å‡ŗę˜Æå¦äø€č‡“ēš„åˆ¤å®šć€‚ + +## ē›®å½•ē»“ęž„ + +- `agent/`:Agent ęØ”å—ļ¼ˆ`agent.py`态`config.py`)、评测集 `llm_final_response.evalset.json`态`test_config.json` +- `test_llm_final_response.py`:pytest 兄口 + +## ēŽÆå¢ƒå˜é‡ + +- `TRPC_AGENT_API_KEY` ꈖ `API_KEY`ļ¼ˆåæ…å”«ļ¼ŒAgent äøŽč£åˆ¤ęØ”åž‹å…±ē”Øļ¼‰ +- `TRPC_AGENT_BASE_URL`ļ¼ˆåÆé€‰ļ¼‰ +- `TRPC_AGENT_MODEL_NAME`ļ¼ˆåÆé€‰ļ¼Œé»˜č®¤ glm-4-flash) + +## 运蔌 + +```bash +cd examples/evaluation/llm_final_response +pytest test_llm_final_response.py -v --tb=short -s +``` + +čÆ„ęµ‹é›†ē”Øä¾‹äø­éœ€ęä¾›é¢„ęœŸēš„ `final_response`ļ¼Œč£åˆ¤ęØ”åž‹å°†å®žé™…å›žē­”äøŽčÆ„å‚č€ƒåÆ¹ęÆ”åŽč¾“å‡ŗ valid/invalid怂 diff --git a/examples/evaluation/llm_final_response/agent/__init__.py b/examples/evaluation/llm_final_response/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import 
root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/llm_final_response/agent/agent.py b/examples/evaluation/llm_final_response/agent/agent.py new file mode 100644 index 0000000..e42f996 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/agent.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Simple QA agent for llm_final_response evaluator demo.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def get_weather(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚å½“å‰å¤©ę°”ć€‚""" + weather_data = { + "åŒ—äŗ¬": {"temperature": 15, "condition": "ꙓ"}, + "上海": {"temperature": 18, "condition": "å¤šäŗ‘"}, + "深圳": {"temperature": 25, "condition": "ꙓ"}, + } + result = weather_data.get( + city, {"temperature": 20, "condition": "未矄"} + ) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + """Create the agent for llm_final_response demo.""" + return LlmAgent( + name="llm_final_response_agent", + description="ē®€å•é—®ē­”åŠ©ę‰‹ļ¼ŒåÆęŸ„å¤©ę°”", + model=_create_model(), + instruction=( + "ä½ ę˜Æé—®ē­”åŠ©ę‰‹ć€‚ē”Øęˆ·é—®å¤©ę°”ę—¶ē”Ø get_weather ęŸ„čÆ¢åŽē®€ę“å›žē­”ļ¼Œ" + "ä¾‹å¦‚åŖå›žē­”ęø©åŗ¦äøŽå¤©ę°”ēŠ¶å†µć€‚" + ), + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/llm_final_response/agent/config.py b/examples/evaluation/llm_final_response/agent/config.py new file mode 100644 index 0000000..5f1c9c4 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def 
get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4-flash") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/llm_final_response/agent/llm_final_response.evalset.json b/examples/evaluation/llm_final_response/agent/llm_final_response.evalset.json new file mode 100644 index 0000000..3c78d36 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/llm_final_response.evalset.json @@ -0,0 +1,31 @@ +{ + "eval_set_id": "llm_final_response", + "name": "LLM ęœ€ē»ˆå“åŗ”čÆ„ä¼°ē¤ŗä¾‹", + "description": "使用 llm_final_response čÆ„ä¼°å™ØåÆ¹ęÆ”å®žé™…å›žē­”äøŽå‚č€ƒē­”ę”ˆ", + "eval_cases": [ + { + "eval_id": "case_001", + "conversation": [ + { + "invocation_id": "inv-001", + "user_content": { + "parts": [{"text": "äøŠęµ·å¤©ę°”ę€Žä¹ˆę ·"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "äøŠęµ·å½“å‰å¤©ę°”å¤šäŗ‘ļ¼Œę°”ęø© 18°C怂"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [{"id": "t1", "name": "get_weather", "args": {"city": "上海"}}] + } + } + ], + "session_input": { + "app_name": "llm_final_response_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/llm_final_response/agent/test_config.json b/examples/evaluation/llm_final_response/agent/test_config.json new file mode 100644 index 0000000..f7b5ad1 --- /dev/null +++ b/examples/evaluation/llm_final_response/agent/test_config.json @@ -0,0 +1,19 @@ +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "api_key": 
"${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "num_samples": 1, + "generation_config": {"max_tokens": 2000, "temperature": 0.2} + } + } + } + } + ] +} diff --git a/examples/evaluation/llm_final_response/test_llm_final_response.py b/examples/evaluation/llm_final_response/test_llm_final_response.py new file mode 100644 index 0000000..af85d64 --- /dev/null +++ b/examples/evaluation/llm_final_response/test_llm_final_response.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""llm_final_response 评估器示例测试""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_llm_final_response_demo(): + """使用 llm_final_response ęŒ‡ę ‡čÆ„ęµ‹ Agentļ¼šč£åˆ¤ęØ”åž‹åÆ¹ęÆ”å®žé™…å›žē­”äøŽå‚č€ƒē­”ę”ˆć€‚""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "llm_final_response.evalset.json" + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="llm_final_response_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/llm_judge_tools/.env b/examples/evaluation/llm_judge_tools/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/llm_judge_tools/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/llm_judge_tools/README.md b/examples/evaluation/llm_judge_tools/README.md new file mode 100644 index 0000000..843f3ca --- /dev/null +++ b/examples/evaluation/llm_judge_tools/README.md @@ -0,0 +1,63 @@ +# 为裁判 Agent ę³Øå†Œå·„å…·ē¤ŗä¾‹ļ¼ˆRubric ęŒ‡ę ‡ļ¼‰ + +é€ščæ‡ **LLM_EVALUATOR_REGISTRY** äøŗ LLM Judge ēš„č£åˆ¤ Agent ę³Øå†Œå·„å…·ļ¼Œå¹¶åœØ **rubric ꌇꠇ**(`llm_rubric_response`ļ¼‰ēš„ē»†åˆ™äø­**č§„å®šå·„å…·ēš„č°ƒē”Øę—¶ęœŗå’Œē”Øę³•**ļ¼Œä½æč£åˆ¤åœØę‰“åˆ†ę—¶åæ…é”»å…ˆč°ƒē”Øå·„å…·čŽ·å–åˆ¤å®šę ‡å‡†ļ¼Œå†ęŒ‰ę ‡å‡†ę”ę¬¾åˆ¤å®šć€‚ + +## åŽŸē† + +- 
ęœ¬ē¤ŗä¾‹ä½æē”Ø **llm_rubric_response** ęŒ‡ę ‡ļ¼ˆč€Œéž llm_final_responseļ¼‰ļ¼Œé€ščæ‡å¤šę” Rubric ēŗ¦ęŸč£åˆ¤č”Œäøŗć€‚ +- 在 `test_config.json` ēš„ **rubrics** äø­ę˜Žē”®å†™å‡ŗļ¼š + - **č°ƒē”Øę—¶ęœŗ**ļ¼šč£åˆ¤åœØę‰“åˆ†å‰åæ…é”»å…ˆč°ƒē”Ø `get_eval_policy` čŽ·å–ęœ¬ē”Øä¾‹ēš„åˆ¤å®šę ‡å‡†ć€‚ + - **用法**ļ¼šä»…ę ¹ę®čÆ„å·„å…·čæ”å›žēš„ę ‡å‡†ę”ę¬¾čæ›č”Œåˆ¤å®šļ¼Œäøå¾—č‡Ŗč”Œå¢žåŠ ęˆ–å‡å°‘ę”ę¬¾ļ¼›åŽē»­ē»†åˆ™č¦ę±‚ć€Œę ¹ę® get_eval_policy čæ”å›žēš„ę ‡å‡†ē¬¬ N ę”åˆ¤å®šć€ć€‚ +- č£åˆ¤ē”±ę”†ęž¶å†…ēš„ **LlmAgent** ę‹…ä»»ļ¼›é€ščæ‡ `LLM_EVALUATOR_REGISTRY.register_judge_tools("llm_rubric_response", [FunctionTool(get_eval_policy)])` äøŗč£åˆ¤ę³Øå…„ **get_eval_policy** å·„å…·ć€‚č£åˆ¤ęØ”åž‹åœØęŽØē†ę—¶ä¼šēœ‹åˆ° rubricsļ¼Œå› ę­¤ä¼šå…ˆč°ƒē”Øå·„å…·å†ęŒ‰ę”ę¬¾ę‰“åˆ†ļ¼Œå·„å…·ę•ˆęžœę˜Žę˜¾ć€‚ + +## ē›®å½•ē»“ęž„ + +- `agent/`ļ¼šč¢«čÆ„ęµ‹ēš„ Agent、评测集 `judge_tools.evalset.json`态`test_config.json`(含 llm_rubric_response äøŽč§„å®šå·„å…·ē”Øę³•ēš„ rubrics) +- `test_llm_judge_tools.py`ļ¼šę³Øå†Œ judge å·„å…· `get_eval_policy` å¹¶ę‰§č”ŒčÆ„ęµ‹ + +## 示例代码 + +åœØčæč”ŒčÆ„ęµ‹å‰äøŗ **llm_rubric_response** ę³Øå†Œå·„å…·ļ¼š + +```python +from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY +from trpc_agent_sdk.tools import FunctionTool + +def get_eval_policy() -> str: + """č£åˆ¤åœØę‰“åˆ†å‰åæ…é”»č°ƒē”Øļ¼ščæ”å›žęœ¬ē”Øä¾‹ēš„åˆ¤å®šę ‡å‡†ć€‚""" + return ( + "ęœ¬ē”Øä¾‹åˆ¤å®šę ‡å‡†ļ¼ˆå…± 3 ę”ļ¼‰ļ¼š\n" + "1. ęœ€ē»ˆå›žē­”é”»åŒ…å«ę˜Žē”®ēš„ęø©åŗ¦ę•°å€¼ć€‚\n" + "2. ęœ€ē»ˆå›žē­”é”»åŒ…å«å¤©ę°”ēŠ¶å†µęčæ°ć€‚\n" + "3. 
å›žē­”é”»äøŽē”Øęˆ·é—®é¢˜ē›“ęŽ„ē›øå…³ć€‚" + ) + +LLM_EVALUATOR_REGISTRY.register_judge_tools( + "llm_rubric_response", + [FunctionTool(get_eval_policy)], +) +``` + +在 `test_config.json` ēš„ rubrics äø­č§„å®šč°ƒē”Øę—¶ęœŗäøŽē”Øę³•ļ¼ˆč§ `agent/test_config.json`ļ¼‰ļ¼š + +- ē¬¬äø€ę” rubricļ¼šč£åˆ¤åœØę‰“åˆ†å‰åæ…é”»å…ˆč°ƒē”Ø `get_eval_policy`ļ¼Œå†ä»…ę ¹ę®čæ”å›žēš„ę”ę¬¾åˆ¤å®šć€‚ +- 其余 rubricsļ¼šę ¹ę® `get_eval_policy` čæ”å›žēš„ę ‡å‡†ē¬¬ 1态2态3 ę”é€ę”åˆ¤å®šć€‚ + +å–ę¶ˆę³Øå†Œļ¼š + +```python +LLM_EVALUATOR_REGISTRY.unregister_judge_tools("llm_rubric_response") +``` + +## ēŽÆå¢ƒå˜é‡ + +- `TRPC_AGENT_API_KEY` ꈖ `API_KEY`ļ¼ˆåæ…å”«ļ¼ŒAgent äøŽč£åˆ¤ęØ”åž‹å…±ē”Øļ¼‰ +- `TRPC_AGENT_BASE_URL`ļ¼ˆåÆé€‰ļ¼‰ +- `TRPC_AGENT_MODEL_NAME`ļ¼ˆåÆé€‰ļ¼‰ + +## 运蔌 + +```bash +cd examples/evaluation/llm_judge_tools +pytest test_llm_judge_tools.py -v --tb=short -s +``` diff --git a/examples/evaluation/llm_judge_tools/agent/__init__.py b/examples/evaluation/llm_judge_tools/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/llm_judge_tools/agent/agent.py b/examples/evaluation/llm_judge_tools/agent/agent.py new file mode 100644 index 0000000..f39b45e --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/agent.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Simple QA agent for llm_judge_tools example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def 
get_weather(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚å½“å‰å¤©ę°”ć€‚""" + weather_data = { + "åŒ—äŗ¬": {"temperature": 15, "condition": "ꙓ"}, + "上海": {"temperature": 18, "condition": "å¤šäŗ‘"}, + "深圳": {"temperature": 25, "condition": "ꙓ"}, + } + result = weather_data.get( + city, {"temperature": 20, "condition": "未矄"} + ) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + """Create the agent for llm_judge_tools demo.""" + return LlmAgent( + name="llm_judge_tools_agent", + description="ē®€å•é—®ē­”åŠ©ę‰‹ļ¼ŒåÆęŸ„å¤©ę°”", + model=_create_model(), + instruction=( + "ä½ ę˜Æé—®ē­”åŠ©ę‰‹ć€‚ē”Øęˆ·é—®å¤©ę°”ę—¶ē”Ø get_weather ęŸ„čÆ¢åŽē®€ę“å›žē­”ļ¼Œ" + "ä¾‹å¦‚åŖå›žē­”ęø©åŗ¦äøŽå¤©ę°”ēŠ¶å†µć€‚" + ), + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/llm_judge_tools/agent/config.py b/examples/evaluation/llm_judge_tools/agent/config.py new file mode 100644 index 0000000..5f1c9c4 --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4-flash") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/llm_judge_tools/agent/judge_tools.evalset.json b/examples/evaluation/llm_judge_tools/agent/judge_tools.evalset.json new file mode 100644 index 0000000..46a9f7f --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/judge_tools.evalset.json @@ -0,0 +1,31 @@ +{ + "eval_set_id": 
"llm_judge_tools", + "name": "Judge ę³Øå†Œå·„å…·ē¤ŗä¾‹", + "description": "使用 llm_rubric_response,在 rubric äø­č§„å®šč£åˆ¤é”»å…ˆč°ƒē”Ø get_eval_policy å†ęŒ‰čæ”å›žę”ę¬¾åˆ¤å®š", + "eval_cases": [ + { + "eval_id": "case_001", + "conversation": [ + { + "invocation_id": "inv-001", + "user_content": { + "parts": [{"text": "äøŠęµ·å¤©ę°”ę€Žä¹ˆę ·"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "äøŠęµ·å½“å‰å¤©ę°”å¤šäŗ‘ļ¼Œę°”ęø© 18°C怂"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [{"id": "t1", "name": "get_weather", "args": {"city": "上海"}}] + } + } + ], + "session_input": { + "app_name": "llm_judge_tools_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/llm_judge_tools/agent/test_config.json b/examples/evaluation/llm_judge_tools/agent/test_config.json new file mode 100644 index 0000000..2b6bc7b --- /dev/null +++ b/examples/evaluation/llm_judge_tools/agent/test_config.json @@ -0,0 +1,53 @@ +{ + "metrics": [ + { + "metric_name": "llm_rubric_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "num_samples": 1, + "generation_config": {"max_tokens": 2000, "temperature": 0.2} + }, + "rubrics": [ + { + "id": "call_tool_first", + "content": { + "text": "č£åˆ¤åœØę‰“åˆ†å‰åæ…é”»å…ˆč°ƒē”Ø get_eval_policy čŽ·å–ęœ¬ē”Øä¾‹ēš„åˆ¤å®šę ‡å‡†ļ¼Œå†ä»…ę ¹ę®čÆ„å·„å…·čæ”å›žēš„ę ‡å‡†ę”ę¬¾čæ›č”Œåˆ¤å®šļ¼Œäøå¾—č‡Ŗč”Œå¢žåŠ ęˆ–å‡å°‘ę”ę¬¾ć€‚" + }, + "description": "åˆ¤å®šå‰é”»č°ƒē”Øå·„å…·čŽ·å–ę ‡å‡†", + "type": "JUDGE_TOOL_USAGE" + }, + { + "id": "clause_1", + "content": { + "text": "ę ¹ę® get_eval_policy čæ”å›žēš„ę ‡å‡†ē¬¬ 1 ę”åˆ¤å®šļ¼šęœ€ē»ˆå›žē­”ę˜Æå¦ę»”č¶³čÆ„ę”ć€‚" + }, + "description": "ęŒ‰ę”æē­–ē¬¬ 1 ę”åˆ¤å®š", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "clause_2", + "content": { + "text": "ę ¹ę® get_eval_policy čæ”å›žēš„ę ‡å‡†ē¬¬ 2 
ę”åˆ¤å®šļ¼šęœ€ē»ˆå›žē­”ę˜Æå¦ę»”č¶³čÆ„ę”ć€‚" + }, + "description": "ęŒ‰ę”æē­–ē¬¬ 2 ę”åˆ¤å®š", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "clause_3", + "content": { + "text": "ę ¹ę® get_eval_policy čæ”å›žēš„ę ‡å‡†ē¬¬ 3 ę”åˆ¤å®šļ¼šęœ€ē»ˆå›žē­”ę˜Æå¦ę»”č¶³čÆ„ę”ć€‚" + }, + "description": "ęŒ‰ę”æē­–ē¬¬ 3 ę”åˆ¤å®š", + "type": "RELEVANCE" + } + ] + } + } + } + ] +} diff --git a/examples/evaluation/llm_judge_tools/test_llm_judge_tools.py b/examples/evaluation/llm_judge_tools/test_llm_judge_tools.py new file mode 100644 index 0000000..33ca41d --- /dev/null +++ b/examples/evaluation/llm_judge_tools/test_llm_judge_tools.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""为裁判 Agent ę³Øå†Œå·„å…·ēš„ē¤ŗä¾‹ļ¼šä½æē”Ø llm_rubric_response,在 rubric äø­č§„å®šå·„å…·ēš„č°ƒē”Øę—¶ęœŗäøŽē”Øę³•ć€‚""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator, LLM_EVALUATOR_REGISTRY +from trpc_agent_sdk.tools import FunctionTool + + +def get_eval_policy() -> str: + """č£åˆ¤åœØę‰“åˆ†å‰åæ…é”»č°ƒē”Øēš„å·„å…·ļ¼ščæ”å›žęœ¬ē”Øä¾‹ēš„åˆ¤å®šę ‡å‡†ć€‚č£åˆ¤é”»å…ˆč°ƒē”Øęœ¬å·„å…·čŽ·å–ę ‡å‡†ļ¼Œå†ä»…ęŒ‰čæ”å›žēš„ę”ę¬¾é€ę”åˆ¤å®šć€‚""" + return ( + "ęœ¬ē”Øä¾‹åˆ¤å®šę ‡å‡†ļ¼ˆå…± 3 ę”ļ¼‰ļ¼š\n" + "1. ęœ€ē»ˆå›žē­”é”»åŒ…å«ę˜Žē”®ēš„ęø©åŗ¦ę•°å€¼ļ¼ˆå¦‚ 18态18°C)。\n" + "2. ęœ€ē»ˆå›žē­”é”»åŒ…å«å¤©ę°”ēŠ¶å†µęčæ°ļ¼ˆå¦‚ę™“ć€å¤šäŗ‘ć€é˜“ļ¼‰ć€‚\n" + "3. 
å›žē­”é”»äøŽē”Øęˆ·é—®é¢˜ē›“ęŽ„ē›øå…³ļ¼Œäøå¾—ē­”éžę‰€é—®ć€‚" + ) + + +# äøŗ llm_rubric_response ēš„ judge agent ę³Øå†Œå·„å…·ļ¼›rubric äø­å·²č§„å®šć€Œåæ…é”»å…ˆč°ƒē”Ø get_eval_policy å†ęŒ‰čæ”å›žę”ę¬¾åˆ¤å®šć€ +LLM_EVALUATOR_REGISTRY.register_judge_tools( + "llm_rubric_response", + [FunctionTool(get_eval_policy)], +) + + +@pytest.mark.asyncio +async def test_llm_judge_with_tools(): + """使用 llm_rubric_responseļ¼šč£åˆ¤ęŒ‰ rubric é”»å…ˆč°ƒē”Ø get_eval_policy čŽ·å–åˆ¤å®šę ‡å‡†ļ¼Œå†ęŒ‰ę ‡å‡†ę”ę¬¾ę‰“åˆ†ć€‚""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "judge_tools.evalset.json") + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="llm_judge_tools_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/llm_rubric_knowledge_recall/.env b/examples/evaluation/llm_rubric_knowledge_recall/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/llm_rubric_knowledge_recall/README.md b/examples/evaluation/llm_rubric_knowledge_recall/README.md new file mode 100644 index 0000000..6a9d960 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/README.md @@ -0,0 +1,23 @@ +# llm_rubric_knowledge_recall 评估器示例 + +使用 **llm_rubric_knowledge_recall** ęŒ‡ę ‡ļ¼šä»Ž Agent å®žé™…č½Øčæ¹äø­ęå–**ēŸ„čÆ†ę£€ē“¢ē±»å·„å…·**(默认 `knowledge_search`ļ¼‰ēš„č°ƒē”Øē»“ęžœļ¼Œē”±č£åˆ¤ęØ”åž‹ę ¹ę® **rubrics** åˆ¤å®šę£€ē“¢å†…å®¹ę˜Æå¦č¶³ä»„ę”Æę’‘é—®é¢˜ęˆ–ē»†åˆ™ļ¼Œé€‚ē”ØäŗŽ RAG å¬å›žč“Øé‡čÆ„ä¼°ć€‚ + +## ē›®å½•ē»“ęž„ + +- `agent/`:Agent åø¦ `knowledge_search` 巄具、评测集 `llm_rubric_knowledge_recall.evalset.json`态`test_config.json`(含 `rubrics` äøŽåÆé€‰ `knowledge_tool_names`) +- `test_llm_rubric_knowledge_recall.py`:pytest 兄口 + +## ēŽÆå¢ƒå˜é‡ + +- `TRPC_AGENT_API_KEY` ꈖ 
`API_KEY`ļ¼ˆåæ…å”«ļ¼‰ +- `TRPC_AGENT_BASE_URL`ļ¼ˆåÆé€‰ļ¼‰ +- `TRPC_AGENT_MODEL_NAME`ļ¼ˆåÆé€‰ļ¼Œé»˜č®¤ glm-4-flash) + +## 运蔌 + +```bash +cd examples/evaluation/llm_rubric_knowledge_recall +pytest test_llm_rubric_knowledge_recall.py -v --tb=short -s +``` + +Agent åæ…é”»åœØå®žé™…čæč”Œäø­č°ƒē”Ø `knowledge_search`ļ¼ˆęˆ–ä½ åœØ `knowledge_tool_names` äø­é…ē½®ēš„å·„å…·åļ¼‰ļ¼Œå¦åˆ™č½Øčæ¹äø­ę— ę£€ē“¢ē»“ęžœļ¼Œč£åˆ¤ę— ę³•ēØ³å®šę‰“åˆ†ć€‚ęœ¬ē¤ŗä¾‹äø­ `knowledge_search` čæ”å›žęØ”ę‹Ÿę–‡ę”£ļ¼Œč£åˆ¤ę®ę­¤äøŽ rubrics åˆ¤å®šć€‚ diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/__init__.py b/examples/evaluation/llm_rubric_knowledge_recall/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/agent.py b/examples/evaluation/llm_rubric_knowledge_recall/agent/agent.py new file mode 100644 index 0000000..e1655de --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/agent/agent.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent with knowledge_search for llm_rubric_knowledge_recall evaluator demo.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def knowledge_search(query: str, top_k: int = 3) -> Dict[str, Any]: + """ēŸ„čÆ†ę£€ē“¢ļ¼šę ¹ę® query čæ”å›žęØ”ę‹Ÿēš„ę£€ē“¢ē»“ęžœļ¼ˆē¤ŗä¾‹ē”Øļ¼‰ć€‚""" + # ęØ”ę‹Ÿę£€ē“¢ē»“ęžœļ¼Œč£åˆ¤å°†ę ¹ę®čæ™äŗ›å†…å®¹äøŽ rubrics åˆ¤å®šå¬å›žč“Øé‡ + mock_docs = [ + 
{"title": "产品A", "content": "产品A é€‚ē”ØäŗŽä¼äøšåä½œļ¼Œę”ÆęŒę–‡ę”£äøŽä»»åŠ”ē®”ē†ć€‚"}, + {"title": "产品B", "content": "产品B ęä¾› API äøŽ SDKļ¼Œä¾æäŗŽé›†ęˆć€‚"}, + {"title": "产品C", "content": "产品C é¢å‘äøŖäŗŗē”Øęˆ·ļ¼Œęä¾›ē¬”č®°äøŽå¾…åŠžć€‚"}, + ] + return { + "query": query, + "results": mock_docs[:top_k], + } + + +def create_agent() -> LlmAgent: + """Create the agent for llm_rubric_knowledge_recall demo.""" + return LlmAgent( + name="llm_rubric_knowledge_recall_agent", + description="åø¦ēŸ„čÆ†ę£€ē“¢ēš„é—®ē­”åŠ©ę‰‹", + model=_create_model(), + instruction=( + "ä½ ę˜ÆēŸ„čÆ†é—®ē­”åŠ©ę‰‹ć€‚ē”Øęˆ·ęé—®ę—¶å…ˆč°ƒē”Ø knowledge_search ę£€ē“¢ē›øå…³ēŸ„čÆ†ļ¼Œ" + "å†ę ¹ę®ę£€ē“¢ē»“ęžœē»„ē»‡å›žē­”ć€‚åæ…é”»č°ƒē”Ø knowledge_search åŽå†å›žē­”ć€‚" + ), + tools=[FunctionTool(knowledge_search)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/config.py b/examples/evaluation/llm_rubric_knowledge_recall/agent/config.py new file mode 100644 index 0000000..5f1c9c4 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4-flash") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/llm_rubric_knowledge_recall.evalset.json b/examples/evaluation/llm_rubric_knowledge_recall/agent/llm_rubric_knowledge_recall.evalset.json new file mode 100644 index 0000000..dfb97a6 --- /dev/null +++ 
b/examples/evaluation/llm_rubric_knowledge_recall/agent/llm_rubric_knowledge_recall.evalset.json @@ -0,0 +1,33 @@ +{ + "eval_set_id": "llm_rubric_knowledge_recall", + "name": "LLM ē»†åˆ™ēŸ„čÆ†åŗ“å¬å›žčÆ„ä¼°ē¤ŗä¾‹", + "description": "使用 llm_rubric_knowledge_recall čÆ„ä¼°å™Øåˆ¤å®šę£€ē“¢åˆ°ēš„ēŸ„čÆ†ę˜Æå¦ę”Æę’‘ rubrics", + "eval_cases": [ + { + "eval_id": "case_001", + "conversation": [ + { + "invocation_id": "inv-001", + "user_content": { + "parts": [{"text": "å…¬åøęœ‰å“Ŗäŗ›äŗ§å“ļ¼ŸčÆ·ē®€å•ä»‹ē»"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "ę ¹ę®ę£€ē“¢ē»“ęžœļ¼Œå…¬åøęœ‰äŗ§å“Aļ¼ˆä¼äøšåä½œļ¼‰ć€äŗ§å“B(API/SDK)、产品Cļ¼ˆäøŖäŗŗē¬”č®°äøŽå¾…åŠžļ¼‰ć€‚"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + {"id": "t1", "name": "knowledge_search", "args": {"query": "å…¬åøäŗ§å“", "top_k": 3}} + ] + } + } + ], + "session_input": { + "app_name": "llm_rubric_knowledge_recall_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/llm_rubric_knowledge_recall/agent/test_config.json b/examples/evaluation/llm_rubric_knowledge_recall/agent/test_config.json new file mode 100644 index 0000000..c2f831d --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/agent/test_config.json @@ -0,0 +1,32 @@ +{ + "metrics": [ + { + "metric_name": "llm_rubric_knowledge_recall", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}" + }, + "rubrics": [ + { + "id": "coverage", + "content": {"text": "ę£€ē“¢åˆ°ēš„å†…å®¹é”»č¦†ē›–é—®é¢˜äø­ēš„å…³é”®äæ”ęÆļ¼ˆå¦‚äŗ§å“ć€åŠŸčƒ½ļ¼‰"}, + "description": "å¬å›žč¦†ē›–", + "type": "KNOWLEDGE_COVERAGE" + }, + { + "id": "relevance", + "content": {"text": "ę£€ē“¢ē»“ęžœé”»äøŽē”Øęˆ·é—®é¢˜ē›øå…³"}, + "description": "å¬å›žē›øå…³ę€§", + "type": "KNOWLEDGE_RELEVANCE" + } + ], + "knowledge_tool_names": ["knowledge_search"] + } + } + } + ] +} 
diff --git a/examples/evaluation/llm_rubric_knowledge_recall/test_llm_rubric_knowledge_recall.py b/examples/evaluation/llm_rubric_knowledge_recall/test_llm_rubric_knowledge_recall.py new file mode 100644 index 0000000..92fde20 --- /dev/null +++ b/examples/evaluation/llm_rubric_knowledge_recall/test_llm_rubric_knowledge_recall.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""llm_rubric_knowledge_recall 评估器示例测试""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_llm_rubric_knowledge_recall_demo(): + """使用 llm_rubric_knowledge_recall ęŒ‡ę ‡ļ¼šč£åˆ¤ę ¹ę®č½Øčæ¹äø­ēš„ēŸ„čÆ†ę£€ē“¢ē»“ęžœäøŽ rubrics åˆ¤å®šå¬å›žč“Øé‡ć€‚""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "llm_rubric_knowledge_recall.evalset.json" + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="llm_rubric_knowledge_recall_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/llm_rubric_response/.env b/examples/evaluation/llm_rubric_response/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/llm_rubric_response/README.md b/examples/evaluation/llm_rubric_response/README.md new file mode 100644 index 0000000..ad9304c --- /dev/null +++ b/examples/evaluation/llm_rubric_response/README.md @@ -0,0 +1,23 @@ +# llm_rubric_response 评估器示例 + +使用 **llm_rubric_response** ęŒ‡ę ‡ļ¼šč£åˆ¤ęØ”åž‹ę ¹ę®é…ē½®ēš„ **rubrics**ļ¼ˆčÆ„ä¼°ē»†åˆ™ļ¼‰é€ę”åˆ¤å®š Agent ęœ€ē»ˆå›žē­”ę˜Æå¦ę»”č¶³ļ¼Œå•č½®åˆ†ę•°äøŗå„ē»†åˆ™å¾—åˆ†å¹³å‡å€¼ć€‚ + +## ē›®å½•ē»“ęž„ + +- `agent/`:Agent ęØ”å—ć€čÆ„ęµ‹é›† `llm_rubric_response.evalset.json`态`test_config.json`(含 `rubrics`) +- 
`test_llm_rubric_response.py`:pytest 兄口 + +## ēŽÆå¢ƒå˜é‡ + +- `TRPC_AGENT_API_KEY` ꈖ `API_KEY`ļ¼ˆåæ…å”«ļ¼‰ +- `TRPC_AGENT_BASE_URL`ļ¼ˆåÆé€‰ļ¼‰ +- `TRPC_AGENT_MODEL_NAME`ļ¼ˆåÆé€‰ļ¼Œé»˜č®¤ glm-4-flash) + +## 运蔌 + +```bash +cd examples/evaluation/llm_rubric_response +pytest test_llm_rubric_response.py -v --tb=short -s +``` + +`test_config.json` äø­éœ€é…ē½® `criterion.llm_judge.judge_model` äøŽ `criterion.llm_judge.rubrics`ļ¼ŒęÆę” rubric ēš„ `content.text` ä¼šå±•ē¤ŗē»™č£åˆ¤ęØ”åž‹ē”ØäŗŽåˆ¤å®šć€‚ diff --git a/examples/evaluation/llm_rubric_response/agent/__init__.py b/examples/evaluation/llm_rubric_response/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/llm_rubric_response/agent/agent.py b/examples/evaluation/llm_rubric_response/agent/agent.py new file mode 100644 index 0000000..50c923a --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/agent.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Simple agent for llm_rubric_response evaluator demo.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def get_weather(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚å½“å‰å¤©ę°”ć€‚""" + weather_data = { + "åŒ—äŗ¬": {"temperature": 15, "condition": "ꙓ"}, + "上海": {"temperature": 18, "condition": "å¤šäŗ‘"}, + "深圳": {"temperature": 25, "condition": "ꙓ"}, + } + result = weather_data.get( + city, {"temperature": 20, "condition": "未矄"} + ) + 
return {"city": city, **result} + + +def create_agent() -> LlmAgent: + """Create the agent for llm_rubric_response demo.""" + return LlmAgent( + name="llm_rubric_response_agent", + description="ē®€å•é—®ē­”åŠ©ę‰‹ļ¼ŒåÆęŸ„å¤©ę°”", + model=_create_model(), + instruction=( + "ä½ ę˜Æé—®ē­”åŠ©ę‰‹ć€‚ē”Øęˆ·é—®å¤©ę°”ę—¶ē”Ø get_weather ęŸ„čÆ¢åŽå›žē­”ć€‚" + "å›žē­”é”»åŒ…å«ę˜Žē”®ē»“č®ŗļ¼ˆå¦‚ęø©åŗ¦ć€å¤©ę°”ēŠ¶å†µļ¼‰ļ¼Œäø”äøŽē”Øęˆ·é—®é¢˜ē›“ęŽ„ē›øå…³ć€‚" + ), + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/llm_rubric_response/agent/config.py b/examples/evaluation/llm_rubric_response/agent/config.py new file mode 100644 index 0000000..5f1c9c4 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4-flash") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/llm_rubric_response/agent/llm_rubric_response.evalset.json b/examples/evaluation/llm_rubric_response/agent/llm_rubric_response.evalset.json new file mode 100644 index 0000000..8b4cacd --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/llm_rubric_response.evalset.json @@ -0,0 +1,31 @@ +{ + "eval_set_id": "llm_rubric_response", + "name": "LLM ē»†åˆ™å“åŗ”čÆ„ä¼°ē¤ŗä¾‹", + "description": "使用 llm_rubric_response čÆ„ä¼°å™ØęŒ‰ rubrics åˆ¤å®šęœ€ē»ˆå›žē­”č“Øé‡", + "eval_cases": [ + { + "eval_id": "case_001", + "conversation": [ + { + "invocation_id": 
"inv-001", + "user_content": { + "parts": [{"text": "äøŠęµ·ä»Šå¤©å¤©ę°”å¦‚ä½•"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "äøŠęµ·å½“å‰å¤šäŗ‘ļ¼Œ18°C怂"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [{"id": "t1", "name": "get_weather", "args": {"city": "上海"}}] + } + } + ], + "session_input": { + "app_name": "llm_rubric_response_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/llm_rubric_response/agent/test_config.json b/examples/evaluation/llm_rubric_response/agent/test_config.json new file mode 100644 index 0000000..995a9e6 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/agent/test_config.json @@ -0,0 +1,31 @@ +{ + "metrics": [ + { + "metric_name": "llm_rubric_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}" + }, + "rubrics": [ + { + "id": "conclusion", + "content": {"text": "å›žē­”é”»åŒ…å«ę˜Žē”®ēš„ē»“č®ŗęˆ–ę•°å€¼ļ¼ˆå¦‚ęø©åŗ¦ć€å¤©ę°”ēŠ¶å†µļ¼‰"}, + "description": "ē»“č®ŗę˜Žē”®", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "relevance", + "content": {"text": "å›žē­”é”»äøŽē”Øęˆ·é—®é¢˜ē›“ęŽ„ē›øå…³"}, + "description": "切题", + "type": "RELEVANCE" + } + ] + } + } + } + ] +} diff --git a/examples/evaluation/llm_rubric_response/test_llm_rubric_response.py b/examples/evaluation/llm_rubric_response/test_llm_rubric_response.py new file mode 100644 index 0000000..c8fd094 --- /dev/null +++ b/examples/evaluation/llm_rubric_response/test_llm_rubric_response.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""llm_rubric_response 评估器示例测试""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_llm_rubric_response_demo(): + """使用 llm_rubric_response ęŒ‡ę ‡čÆ„ęµ‹ Agentļ¼šč£åˆ¤ęŒ‰ rubrics é€ę”åˆ¤å®šęœ€ē»ˆå›žē­”ć€‚""" + 
test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join( + test_dir, "agent", "llm_rubric_response.evalset.json" + ) + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="llm_rubric_response_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/pass_at_k/.env b/examples/evaluation/pass_at_k/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/pass_at_k/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/pass_at_k/README.md b/examples/evaluation/pass_at_k/README.md new file mode 100644 index 0000000..71b2bb6 --- /dev/null +++ b/examples/evaluation/pass_at_k/README.md @@ -0,0 +1,22 @@ +# pass@k / pass^k 示例 + +å¤šč½®čæč”ŒåŽč§£ęž (n, c)ļ¼Œč®”ē®— pass@1态pass@5态pass^2ć€‚ē›®å½•äøŽē”Øę³•ä»æē…§ [quickstart](../quickstart/)怂 + +## ē›®å½•ē»“ęž„ + +- `pass_at_k/`ļ¼šē¤ŗä¾‹ę ¹ē›®å½• +- `agent/`ļ¼šå†…å« `agent.py`态`weather_agent.evalset.json`态`config.py`态`test_config.json`ļ¼ˆå…¶äø­ **num_runs: 5**) +- `test_pass_at_k.py`ļ¼šä½æē”Ø get_executer态evaluate态get_result态parse_pass_nc态pass_at_k态pass_hat_k + +## ēŽÆå¢ƒč¦ę±‚ + +Python 3.10+ć€‚ēŽÆå¢ƒå˜é‡åŒ quickstart(`TRPC_AGENT_API_KEY` 等)。 + +## 运蔌 + +```bash +cd examples/evaluation/pass_at_k +pytest test_pass_at_k.py -v --tb=short -s +``` + +ē»ˆē«Æä¼šę‰“å°å„čÆ„ęµ‹é›†ēš„ n态c 仄及 pass@1态pass@5态pass^2怂 diff --git a/examples/evaluation/pass_at_k/agent/__init__.py b/examples/evaluation/pass_at_k/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/pass_at_k/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/pass_at_k/agent/agent.py b/examples/evaluation/pass_at_k/agent/agent.py new file mode 100644 index 0000000..7f4c6a7 --- /dev/null 
+++ b/examples/evaluation/pass_at_k/agent/agent.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent for pass@k example.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def get_weather(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚å½“å‰å¤©ę°”ć€‚""" + weather_data = { + "åŒ—äŗ¬": {"temperature": 15, "condition": "ꙓ"}, + "上海": {"temperature": 18, "condition": "å¤šäŗ‘"}, + "深圳": {"temperature": 25, "condition": "ꙓ"}, + "ę­å·ž": {"temperature": 20, "condition": "å°é›Ø"}, + } + result = weather_data.get( + city, {"temperature": 20, "condition": "未矄"} + ) + return {"city": city, **result} + + +def create_agent() -> LlmAgent: + """Create the weather agent.""" + return LlmAgent( + name="weather_agent", + description="å¤©ę°”ęŸ„čÆ¢åŠ©ę‰‹", + model=_create_model(), + instruction="ä½ ę˜Æå¤©ę°”åŠ©ę‰‹ļ¼Œē”Ø get_weather ęŸ„čÆ¢åŸŽåø‚å¤©ę°”å¹¶ē®€č¦å›žē­”ć€‚", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/pass_at_k/agent/config.py b/examples/evaluation/pass_at_k/agent/config.py new file mode 100644 index 0000000..fc6bea3 --- /dev/null +++ b/examples/evaluation/pass_at_k/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", 
"glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/pass_at_k/agent/test_config.json b/examples/evaluation/pass_at_k/agent/test_config.json new file mode 100644 index 0000000..447464b --- /dev/null +++ b/examples/evaluation/pass_at_k/agent/test_config.json @@ -0,0 +1,28 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ], + "num_runs": 5 +} diff --git a/examples/evaluation/pass_at_k/agent/weather_agent.evalset.json b/examples/evaluation/pass_at_k/agent/weather_agent.evalset.json new file mode 100644 index 0000000..80f84b5 --- /dev/null +++ b/examples/evaluation/pass_at_k/agent/weather_agent.evalset.json @@ -0,0 +1,37 @@ +{ + "eval_set_id": "weather_agent_pass_at_k", + "name": "pass@k 示例评测集", + "description": "å•ē”Øä¾‹å¤šč½®čæč”Œļ¼Œē”ØäŗŽč®”ē®— pass@k / pass^k", + "eval_cases": [ + { + "eval_id": "simple_weather_001", + "conversation": [ + { + "invocation_id": "e-quick-001", + "user_content": { + "parts": [{"text": "äøŠęµ·å¤©ę°”ę€Žä¹ˆę ·"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "18"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/pass_at_k/test_pass_at_k.py b/examples/evaluation/pass_at_k/test_pass_at_k.py new file mode 100644 
index 0000000..d8987cd --- /dev/null +++ b/examples/evaluation/pass_at_k/test_pass_at_k.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""pass@k / pass^k ē¤ŗä¾‹ļ¼šå¤šč½®čæč”ŒåŽč®”ē®— pass@1态pass@5态pass^2怂""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_pass_at_k(): + """å¤šč½®čæč”ŒåŽč§£ęž (n, c)ļ¼Œč®”ē®— pass@k äøŽ pass^k怂""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "weather_agent.evalset.json") + + # test_config.json äø­å·²é…ē½® num_runs: 5ļ¼Œä¼šč·‘ 5 č½® + executer = AgentEvaluator.get_executer( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) + try: + await executer.evaluate() + finally: + result = executer.get_result() + if result is not None: + nc_by_set = AgentEvaluator.parse_pass_nc(result) + for eval_set_id, nc in nc_by_set.items(): + n, c = nc.n, nc.c + pass_1 = AgentEvaluator.pass_at_k(n, c, 1) + pass_5 = AgentEvaluator.pass_at_k(n, c, 5) + pass_hat_2 = AgentEvaluator.pass_hat_k(n, c, 2) + print( + f"EvalSet {eval_set_id}: n={n}, c={c}, " + f"pass@1={pass_1:.4f}, pass@5={pass_5:.4f}, pass^2={pass_hat_2:.4f}" + ) diff --git a/examples/evaluation/quickstart/.env b/examples/evaluation/quickstart/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/quickstart/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/quickstart/README.md b/examples/evaluation/quickstart/README.md new file mode 100644 index 0000000..6d709bd --- /dev/null +++ b/examples/evaluation/quickstart/README.md @@ -0,0 +1,27 @@ +# Quickstart 天气 Agent 示例 + +ęœ€å°čÆ„ęµ‹ē¤ŗä¾‹ļ¼šå¤©ę°”ęŸ„čÆ¢ Agentļ¼Œä»…åŒ…å«äø€äøŖ evalsetļ¼ˆå•ē”Øä¾‹ļ¼‰ć€‚ + +## ē›®å½•ē»“ęž„ + +- `quickstart/`ļ¼šē¤ŗä¾‹ę ¹ē›®å½• +- `agent/`ļ¼šå†…å« 
`agent.py`(`root_agent.name="weather_agent"`)、`weather_agent.evalset.json`态`config.py`态`test_config.json` ē­‰ + +## ēŽÆå¢ƒč¦ę±‚ + +Python 3.10+(建议 3.12) + +## ēŽÆå¢ƒå˜é‡ + +在 `.env` ęˆ–ēŽÆå¢ƒäø­č®¾ē½®ļ¼š + +- `TRPC_AGENT_API_KEY` ꈖ `API_KEY` +- `TRPC_AGENT_BASE_URL`ļ¼ˆåÆé€‰ļ¼‰ +- `TRPC_AGENT_MODEL_NAME`ļ¼ˆåÆé€‰ļ¼Œé»˜č®¤ glm-4.7) + +## čæč”ŒčÆ„ęµ‹ + +```bash +cd examples/evaluation/quickstart +pytest test_quickstart.py -v --tb=short -s +``` diff --git a/examples/evaluation/quickstart/agent/__init__.py b/examples/evaluation/quickstart/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/quickstart/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/quickstart/agent/agent.py b/examples/evaluation/quickstart/agent/agent.py new file mode 100644 index 0000000..39850fd --- /dev/null +++ b/examples/evaluation/quickstart/agent/agent.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Weather agent: current weather, forecast, AQI, UV index.""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def _create_model() -> OpenAIModel: + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def get_weather(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚å½“å‰å¤©ę°”ć€‚""" + weather_data = { + "åŒ—äŗ¬": {"temperature": 15, "condition": "ꙓ", "humidity": 45, "wind_speed": 10}, + "上海": {"temperature": 18, "condition": "å¤šäŗ‘", "humidity": 60, "wind_speed": 15}, + "深圳": {"temperature": 25, "condition": "ꙓ", "humidity": 70, "wind_speed": 8}, + "ę­å·ž": {"temperature": 20, "condition": "å°é›Ø", "humidity": 85, "wind_speed": 12}, + 
} + result = weather_data.get( + city, {"temperature": 20, "condition": "未矄", "humidity": 50, "wind_speed": 10} + ) + return {"city": city, **result} + + +def get_weather_forecast(city: str, days: int = 3) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚ęœŖę„å‡ ę—„å¤©ę°”é¢„ęŠ„ć€‚""" + return { + "city": city, + "forecast": [{"date": "today", "temperature": "20°C", "condition": "ꙓ"}] * days, + } + + +def get_air_quality(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚ē©ŗę°”č“Øé‡ć€‚""" + aqi_data = {"åŒ—äŗ¬": 85, "上海": 72, "深圳": 65, "ę­å·ž": 90, "å¹æå·ž": 78} + aqi = aqi_data.get(city, 75) + level = "优" if aqi <= 50 else "良" if aqi <= 100 else "č½»åŗ¦ę±”ęŸ“" + return {"city": city, "aqi": aqi, "level": level} + + +def get_uv_index(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚ē“«å¤–ēŗæęŒ‡ę•°ć€‚""" + uv_data = {"åŒ—äŗ¬": 5, "上海": 6, "深圳": 8, "ę­å·ž": 4, "å¹æå·ž": 7} + uv = uv_data.get(city, 5) + suggestion = "ę³Øę„é˜²ę™’" if uv >= 6 else "é€‚å®œęˆ·å¤–" + return {"city": city, "uv_index": uv, "suggestion": suggestion} + + +def create_agent() -> LlmAgent: + """Create the weather agent.""" + return LlmAgent( + name="weather_agent", + description="å¤©ę°”ęŸ„čÆ¢åŠ©ę‰‹ļ¼ŒåÆęŸ„å½“å‰å¤©ę°”ć€é¢„ęŠ„ć€ē©ŗę°”č“Øé‡ć€ē“«å¤–ēŗæęŒ‡ę•°", + model=_create_model(), + instruction=( + "ä½ ę˜Æå¤©ę°”åŠ©ę‰‹ć€‚ē”Ø get_weather ęŸ„å½“å‰å¤©ę°”ļ¼Œget_weather_forecast ęŸ„é¢„ęŠ„ļ¼Œ" + "get_air_quality ęŸ„ē©ŗę°”č“Øé‡ļ¼Œget_uv_index ęŸ„ē“«å¤–ēŗæć€‚" + "ē”Øęˆ·é—®å¤šē±»äæ”ęÆę—¶ä¾ę¬”č°ƒē”Øē›øåŗ”å·„å…·ć€‚" + ), + tools=[ + FunctionTool(get_weather), + FunctionTool(get_weather_forecast), + FunctionTool(get_air_quality), + FunctionTool(get_uv_index), + ], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/quickstart/agent/config.py b/examples/evaluation/quickstart/agent/config.py new file mode 100644 index 0000000..fc6bea3 --- /dev/null +++ b/examples/evaluation/quickstart/agent/config.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com 
+"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/quickstart/agent/test_config.json b/examples/evaluation/quickstart/agent/test_config.json new file mode 100644 index 0000000..c6824ba --- /dev/null +++ b/examples/evaluation/quickstart/agent/test_config.json @@ -0,0 +1,27 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ] +} diff --git a/examples/evaluation/quickstart/agent/weather_agent.evalset.json b/examples/evaluation/quickstart/agent/weather_agent.evalset.json new file mode 100644 index 0000000..d3a78a2 --- /dev/null +++ b/examples/evaluation/quickstart/agent/weather_agent.evalset.json @@ -0,0 +1,47 @@ +{ + "eval_set_id": "weather_agent_quickstart", + "name": "天气 Agent å•ē”Øä¾‹", + "description": "Quickstart å•č½®å¤©ę°”ęŸ„čÆ¢čÆ„ęµ‹", + "eval_cases": [ + { + "eval_id": "simple_weather_001", + "conversation": [ + { + "invocation_id": "e-quick-001", + "user_content": { + "parts": [ + { + "text": "äøŠęµ·å¤©ę°”ę€Žä¹ˆę ·" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "18°C" + } + ], 
+ "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": { + "city": "上海" + } + } + ] + } + } + ], + "session_input": { + "app_name": "weather_agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/quickstart/test_quickstart.py b/examples/evaluation/quickstart/test_quickstart.py new file mode 100644 index 0000000..2fac302 --- /dev/null +++ b/examples/evaluation/quickstart/test_quickstart.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Quickstart 天气 Agent 评测测试""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_quickstart_with_eval_set(): + """ä½æē”Øå•äøŖ evalset 测试 quickstart 天气 Agent""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "weather_agent.evalset.json") + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/trace_mode/.env b/examples/evaluation/trace_mode/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/trace_mode/.env @@ -0,0 +1 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/trace_mode/README.md b/examples/evaluation/trace_mode/README.md new file mode 100644 index 0000000..3b1ce63 --- /dev/null +++ b/examples/evaluation/trace_mode/README.md @@ -0,0 +1,20 @@ +# Trace ęØ”å¼ē¤ŗä¾‹ + +使用 **eval_mode: "trace"**ļ¼šäøč°ƒē”Ø Agent ęŽØē†ļ¼Œē”Ø evalset äø­ēš„ **actual_conversation** ä½œäøŗć€Œå®žé™…č½Øčæ¹ć€å‚äøŽę‰“åˆ†ļ¼Œ**conversation** ä½œäøŗé¢„ęœŸē”ØäŗŽåÆ¹ęÆ”ć€‚é€‚åˆå›žę”¾å·²ęœ‰åÆ¹čÆć€ē¦»ēŗæčÆ„ä¼°ć€‚ + +## ē›®å½•ē»“ęž„ + +- `trace_mode/`ļ¼šē¤ŗä¾‹ę ¹ē›®å½• +- `agent/`ļ¼šå†…å« `agent.py`态`trace_example.evalset.json`(含 trace 
用例)、`test_config.json`态`config.py` +- `test_trace_mode.py`ļ¼šč°ƒē”Ø `AgentEvaluator.evaluate`ļ¼Œä»…ę‰§č”Œę‰“åˆ†é˜¶ę®µ + +## ēŽÆå¢ƒč¦ę±‚ + +Python 3.10+怂Trace ęØ”å¼äøč·‘ęØ”åž‹ęŽØē†ļ¼Œä½†ę”†ęž¶ä»ä¼šåŠ č½½ agent ęØ”å—ļ¼›č‹„ęœŖé…ē½® `TRPC_AGENT_API_KEY`ļ¼ŒåŠ č½½åÆčƒ½ęŠ„é”™ļ¼ŒåÆęŒ‰éœ€é…ē½®ęˆ–ä»…ē”Øę–¼ęŸ„ēœ‹ēµę§‹ć€‚ + +## 运蔌 + +```bash +cd examples/evaluation/trace_mode +pytest test_trace_mode.py -v --tb=short -s +``` diff --git a/examples/evaluation/trace_mode/agent/__init__.py b/examples/evaluation/trace_mode/agent/__init__.py new file mode 100644 index 0000000..45706f2 --- /dev/null +++ b/examples/evaluation/trace_mode/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/trace_mode/agent/agent.py b/examples/evaluation/trace_mode/agent/agent.py new file mode 100644 index 0000000..c9304fb --- /dev/null +++ b/examples/evaluation/trace_mode/agent/agent.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Minimal agent for trace mode example (in trace mode the agent is not run).""" + +from typing import Any, Dict + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool + +from .config import get_model_config + + +def get_weather(city: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęŒ‡å®šåŸŽåø‚å½“å‰å¤©ę°”ć€‚""" + return {"city": city, "temperature": 18, "condition": "å¤šäŗ‘"} + + +def create_agent() -> LlmAgent: + api_key, url, model_name = get_model_config() + return LlmAgent( + name="weather_agent", + description="å¤©ę°”åŠ©ę‰‹", + model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=url), + instruction="用 get_weather ęŸ„å¤©ę°”å¹¶ē®€č¦å›žē­”ć€‚", + tools=[FunctionTool(get_weather)], + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/trace_mode/agent/config.py 
b/examples/evaluation/trace_mode/agent/config.py new file mode 100644 index 0000000..71081cc --- /dev/null +++ b/examples/evaluation/trace_mode/agent/config.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config (trace ęØ”å¼äøč·‘ęŽØē†ļ¼Œä»…åŠ č½½ęØ”å—ę—¶åÆčƒ½ē”Øåˆ°).""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get("TRPC_AGENT_MODEL_NAME", "glm-4.7") + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/trace_mode/agent/test_config.json b/examples/evaluation/trace_mode/agent/test_config.json new file mode 100644 index 0000000..c6824ba --- /dev/null +++ b/examples/evaluation/trace_mode/agent/test_config.json @@ -0,0 +1,27 @@ +{ + "metrics": [ + { + "metric_name": "tool_trajectory_avg_score", + "threshold": 0.8, + "criterion": { + "tool_trajectory": { + "default": { + "name": {"match": "exact", "case_insensitive": false}, + "arguments": {"match": "exact"} + }, + "order_sensitive": false, + "subset_matching": false + } + } + }, + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + } + ] +} diff --git a/examples/evaluation/trace_mode/agent/trace_example.evalset.json b/examples/evaluation/trace_mode/agent/trace_example.evalset.json new file mode 100644 index 0000000..c522258 --- /dev/null +++ b/examples/evaluation/trace_mode/agent/trace_example.evalset.json @@ -0,0 +1,55 @@ +{ + "eval_set_id": "trace_mode_example", + "name": "Trace ęØ”å¼ē¤ŗä¾‹", + "description": "ä½æē”Øé¢„å½•åˆ¶ēš„ actual_conversation ä½œäøŗå®žé™…č½Øčæ¹ļ¼Œäøč°ƒē”Ø Agent ęŽØē†", + 
"eval_cases": [ + { + "eval_id": "replay_001", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "inv-1", + "user_content": { + "parts": [{"text": "äøŠęµ·å¤©ę°”ę€Žä¹ˆę ·"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "äøŠęµ·ä»Šå¤©å¤šäŗ‘ļ¼Œ18°C"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ], + "conversation": [ + { + "invocation_id": "exp-1", + "user_content": { + "parts": [{"text": "äøŠęµ·å¤©ę°”ę€Žä¹ˆę ·"}], + "role": "user" + }, + "final_response": { + "parts": [{"text": "äøŠęµ·ä»Šå¤©å¤šäŗ‘ļ¼Œ18°C"}], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "t1", + "name": "get_weather", + "args": {"city": "上海"} + } + ] + } + } + ] + } + ] +} diff --git a/examples/evaluation/trace_mode/test_trace_mode.py b/examples/evaluation/trace_mode/test_trace_mode.py new file mode 100644 index 0000000..06235bb --- /dev/null +++ b/examples/evaluation/trace_mode/test_trace_mode.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Trace ęØ”å¼ē¤ŗä¾‹ļ¼šä½æē”Øé¢„å½•åˆ¶ēš„åÆ¹čÆč½Øčæ¹å‚äøŽę‰“åˆ†ļ¼Œäøč°ƒē”Ø Agent ęŽØē†ć€‚""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_trace_mode(): + """Trace ęØ”å¼ļ¼šč·³čæ‡ęŽØē†ļ¼Œē”Ø evalset äø­ēš„ actual_conversation ä½œäøŗå®žé™…č½Øčæ¹å‚äøŽčÆ„ä¼°ć€‚""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "trace_example.evalset.json") + + await AgentEvaluator.evaluate( + agent_module="agent", + agent_name="weather_agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) diff --git a/examples/evaluation/webui/.env b/examples/evaluation/webui/.env new file mode 100644 index 0000000..d47b675 --- /dev/null +++ b/examples/evaluation/webui/.env @@ -0,0 +1 @@ +# Set 
TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME diff --git a/examples/evaluation/webui/README.md b/examples/evaluation/webui/README.md new file mode 100644 index 0000000..fea2ba3 --- /dev/null +++ b/examples/evaluation/webui/README.md @@ -0,0 +1,36 @@ +# WebUI ä¹¦ē±ęŸ„ę‰¾ Agent 示例 + +ęŒ‰ä¼˜å…ˆēŗ§ęŸ„ę‰¾ä¹¦ē±ļ¼šęœ¬åœ°å›¾ä¹¦é¦† → ęœ¬åœ°ä¹¦åŗ— → åœØēŗæé›¶å”®å•†ć€‚ + +## ē›®å½•ē»“ęž„ + +- `webui/`:`--agents` ęŒ‡å‘ę­¤ē›®å½• +- `agent/`ļ¼šå­ē›®å½•åé”»äøŽ `root_agent.name` äø€č‡“ļ¼ˆ`"agent"`ļ¼‰ļ¼Œå†…å« `agent.py`态`agent.evalset.json`态`config.py`态`prompts.py`态`tools.py`态`test_config.json` ē­‰ + +## ēŽÆå¢ƒč¦ę±‚ + +Python 3.10+(建议 3.12) + +## ēŽÆå¢ƒå˜é‡ + +在 `.env` äø­ęˆ–é€ščæ‡ `export` 设置: + +- `TRPC_AGENT_API_KEY` ꈖ `API_KEY` +- `TRPC_AGENT_BASE_URL`ļ¼ˆåÆé€‰ļ¼Œęœ‰é»˜č®¤å€¼ļ¼‰ +- `TRPC_AGENT_MODEL_NAME`ļ¼ˆåÆé€‰ļ¼Œęœ‰é»˜č®¤å€¼ļ¼‰ + +## čæč”Œē¤ŗä¾‹ + +```bash +cd examples/evaluation/webui +python run_agent.py +``` + +## čæč”ŒčÆ„ä¼°ęµ‹čÆ• + +```bash +cd examples/evaluation/webui +pytest test_book_finder.py -v --tb=short -s +``` + +éœ€å·²č®¾ē½®äøŠčæ°ēŽÆå¢ƒå˜é‡ć€‚ diff --git a/examples/evaluation/webui/agent/__init__.py b/examples/evaluation/webui/agent/__init__.py new file mode 100644 index 0000000..449d379 --- /dev/null +++ b/examples/evaluation/webui/agent/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2026 Tencent.com +from .agent import root_agent + +__all__ = ["root_agent"] diff --git a/examples/evaluation/webui/agent/agent.evalset.json b/examples/evaluation/webui/agent/agent.evalset.json new file mode 100644 index 0000000..31dc1d5 --- /dev/null +++ b/examples/evaluation/webui/agent/agent.evalset.json @@ -0,0 +1,101 @@ +{ + "eval_set_id": "agent", + "name": "ä¹¦ē±ęŸ„ę‰¾å·„ä½œęµčÆ„ä¼°", + "description": "ęµ‹čÆ•ä¹¦ē±ęŸ„ę‰¾ Agent åœØäøåŒåœŗę™Æäø‹ēš„č”Œäøŗå’Œå“åŗ”č“Øé‡", + "eval_cases": [ + { + "eval_id": "session_001_library_available", + "conversation": [ + { + "invocation_id": "e-book-001-001", + "user_content": { + 
"parts": [ + { + "text": "ęˆ‘ęƒ³å€Ÿé˜…ć€Šäø‰ä½“ć€‹" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "å¤Ŗå„½äŗ†ļ¼ęˆ‘ę‰¾åˆ°äŗ†ć€Šäø‰ä½“ć€‹čæ™ęœ¬ä¹¦ć€‚ęœ¬åœ°å›¾ä¹¦é¦†ęœ‰3ęœ¬åÆå€Ÿļ¼Œä½äŗŽē§‘ęŠ€å›¾ä¹¦é¦†ēš„ē§‘å¹»å°čÆ“åŒŗ A-301ć€‚å›¾ä¹¦é¦†å€Ÿé˜…ę˜Æå…č“¹ēš„ļ¼ŒęŽØčę‚Øē›“ęŽ„åŽ»å€Ÿé˜…ļ¼" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "tool-call-001", + "name": "search_local_library", + "args": { + "book_title": "三体" + } + } + ] + } + } + ], + "session_input": { + "app_name": "agent", + "user_id": "user", + "state": {} + } + }, + { + "eval_id": "session_002_only_online_available", + "conversation": [ + { + "invocation_id": "e-book-002-001", + "user_content": { + "parts": [ + { + "text": "ęˆ‘ęƒ³ę‰¾ć€Šäŗŗå·„ę™ŗčƒ½ē®€å²ć€‹" + } + ], + "role": "user" + }, + "final_response": { + "parts": [ + { + "text": "å…³äŗŽć€Šäŗŗå·„ę™ŗčƒ½ē®€å²ć€‹ēš„ęŸ„ę‰¾ē»“ęžœå¦‚äø‹ļ¼š\n\n1. **ęœ¬åœ°å›¾ä¹¦é¦†**\n - å¾ˆęŠ±ę­‰ļ¼Œęœ¬åœ°å›¾ä¹¦é¦†ē›®å‰ę²”ęœ‰čæ™ęœ¬ä¹¦ć€‚\n\n2. **ęœ¬åœ°ä¹¦åŗ—**\n - å¾ˆęŠ±ę­‰ļ¼Œęœ¬åœ°ä¹¦åŗ—ä¹Ÿę²”ęœ‰ę‰¾åˆ°čæ™ęœ¬ä¹¦ć€‚\n\n3. 
**在线蓭买**\n - ę‚ØåÆä»„é€‰ę‹©ä»„äø‹å¹³å°č“­ä¹°ļ¼š\n - äŗ¬äøœå›¾ä¹¦ļ¼š[é“¾ęŽ„](https://book.jd.com)\n - å½“å½“ē½‘ļ¼š[é“¾ęŽ„](https://book.dangdang.com)\n - äŗšé©¬é€Šäø­å›½ļ¼š[é“¾ęŽ„](https://www.amazon.cn)\n - å»ŗč®®ę‚Øå…ˆęœē“¢ęÆ”ä»·ļ¼Œé€‰ę‹©ęœ€ä¼˜ęƒ ēš„å¹³å°č“­ä¹°ć€‚" + } + ], + "role": "model" + }, + "intermediate_data": { + "tool_uses": [ + { + "id": "tool-call-002-1", + "name": "search_local_library", + "args": { + "book_title": "äŗŗå·„ę™ŗčƒ½ē®€å²" + } + }, + { + "id": "tool-call-002-2", + "name": "find_local_bookstore", + "args": { + "book_title": "äŗŗå·„ę™ŗčƒ½ē®€å²" + } + }, + { + "id": "tool-call-002-3", + "name": "order_online", + "args": { + "book_title": "äŗŗå·„ę™ŗčƒ½ē®€å²" + } + } + ] + } + } + ], + "session_input": { + "app_name": "agent", + "user_id": "user", + "state": {} + } + } + ] +} diff --git a/examples/evaluation/webui/agent/agent.py b/examples/evaluation/webui/agent/agent.py new file mode 100644 index 0000000..4b2b7bd --- /dev/null +++ b/examples/evaluation/webui/agent/agent.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Book finder agent: local library, bookstore, online retailers.""" + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.tools import FunctionTool +from trpc_agent_sdk.types import GenerateContentConfig + +from .config import get_model_config +from .prompts import INSTRUCTION +from .tools import find_local_bookstore, order_online, search_local_library + + +def _create_model() -> OpenAIModel: + """Create model from config.""" + api_key, url, model_name = get_model_config() + return OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + + +def create_agent() -> LlmAgent: + """Create the book finder agent.""" + return LlmAgent( + name="agent", + description="äø“äøšēš„ä¹¦ē±ęŸ„ę‰¾åŠ©ę‰‹ļ¼ŒåÆęŸ„čÆ¢ęœ¬åœ°å›¾ä¹¦é¦†ć€ä¹¦åŗ—å’ŒåœØēŗæé›¶å”®å•†", + model=_create_model(), + instruction=INSTRUCTION, + tools=[ + 
FunctionTool(search_local_library), + FunctionTool(find_local_bookstore), + FunctionTool(order_online), + ], + generate_content_config=GenerateContentConfig( + temperature=0.3, + top_p=0.9, + max_output_tokens=1500, + ), + ) + + +root_agent = create_agent() diff --git a/examples/evaluation/webui/agent/config.py b/examples/evaluation/webui/agent/config.py new file mode 100644 index 0000000..728c1bb --- /dev/null +++ b/examples/evaluation/webui/agent/config.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Agent config module.""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables.""" + api_key = ( + os.environ.get("TRPC_AGENT_API_KEY") or os.environ.get("API_KEY", "") + ) + url = os.environ.get( + "TRPC_AGENT_BASE_URL", "http://v2.open.venus.woa.com/llmproxy" + ) + model_name = os.environ.get( + "TRPC_AGENT_MODEL_NAME", "deepseek-v3-local-II" + ) + if not api_key: + raise ValueError( + "TRPC_AGENT_API_KEY or API_KEY must be set in environment variables" + ) + return api_key, url, model_name diff --git a/examples/evaluation/webui/agent/prompts.py b/examples/evaluation/webui/agent/prompts.py new file mode 100644 index 0000000..490fbda --- /dev/null +++ b/examples/evaluation/webui/agent/prompts.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Prompts for the book finder agent.""" + +INSTRUCTION = """ä½ ę˜Æäø€äøŖäø“äøšēš„ä¹¦ē±ęŸ„ę‰¾åŠ©ę‰‹ļ¼Œåø®åŠ©ē”Øęˆ·åœØęœ¬åœ°ęˆ–åœØēŗæę‰¾åˆ°ä»–ä»¬ęƒ³č¦ēš„ä¹¦ē±ć€‚ + +### å·„ä½œęµēØ‹ + +**ę­„éŖ¤ 1: 理解请求** +- ä»Žē”Øęˆ·ēš„čÆ·ę±‚äø­ęå–ä¹¦ē±ę ‡é¢˜ +- å¦‚ęžœē”Øęˆ·ęœŖęä¾›å…·ä½“ę ‡é¢˜ļ¼Œē¤¼č²Œåœ°čÆ¢é—®ä»–ä»¬ęƒ³č¦ęŸ„ę‰¾ēš„ä¹¦ē±åē§° + +**ę­„éŖ¤ 2: ęŒ‰ä¼˜å…ˆēŗ§é”ŗåŗęŸ„ę‰¾ä¹¦ē±** + +1. **é¦–å…ˆę£€ęŸ„ęœ¬åœ°å›¾ä¹¦é¦†** (使用 search_local_library å·„å…·) + - å¦‚ęžœęœ‰åÆē”Øå‰Æęœ¬ļ¼Œęä¾›čÆ¦ē»†äæ”ęÆļ¼šåˆ†é¦†ä½ē½®ć€å‰Æęœ¬ę•°é‡ć€å…·ä½“ä½ē½® + - å¼ŗč°ƒå›¾ä¹¦é¦†å€Ÿé˜…ę˜Æå…č“¹ēš„ + +2. 
**ē„¶åŽę£€ęŸ„ęœ¬åœ°ä¹¦åŗ—** (使用 find_local_bookstore å·„å…·) + - å¦‚ęžœå›¾ä¹¦é¦†ę²”ęœ‰ļ¼ŒęŸ„ę‰¾ęœ¬åœ°ä¹¦åŗ—åŗ“å­˜ + - ęä¾›ä¹¦åŗ—åœ°å€ć€ē”µčÆć€ä»·ę ¼å’Œåŗ“å­˜ę•°é‡ + - å»ŗč®®ē”Øęˆ·åÆä»„å®žåœ°ęŸ„ēœ‹ + +3. **ęœ€åŽę£€ęŸ„åœØēŗæé›¶å”®å•†** (使用 order_online å·„å…·) + - å¦‚ęžœęœ¬åœ°čµ„ęŗéƒ½ę²”ęœ‰ļ¼Œęä¾›åœØēŗæč“­ä¹°é€‰é”¹ + - åˆ—å‡ŗå¤šäøŖå¹³å°ä¾›ē”Øęˆ·é€‰ę‹© + - ęä¾›é…é€ę—¶é—“å‚č€ƒ + +**ę­„éŖ¤ 3: å‹å„½å‘ˆēŽ°ē»“ęžœ** +- ē”Øęø…ę™°ć€å‹å„½ēš„čÆ­čØ€ę€»ē»“ęŸ„ę‰¾ē»“ęžœ +- å¦‚ęžœå¤šäøŖęø é“éƒ½ęœ‰ļ¼ŒęŒ‰ä¼˜å…ˆēŗ§ęŽØč +- å¦‚ęžœéƒ½ę²”ęœ‰ļ¼Œå»ŗč®®ē”Øęˆ·å°čÆ•ęœē“¢ē›øä¼¼ä¹¦ē±ęˆ–č”ē³»ä¹¦åŗ—é¢„č®¢ + +### ę³Øę„äŗ‹é”¹ +- äæęŒå‹å„½å’Œäø“äøšēš„ę€åŗ¦ +- ęä¾›å‡†ē”®ć€čÆ¦ē»†ēš„äæ”ęÆ +- ä¼˜å…ˆęŽØčęœ¬åœ°čµ„ęŗļ¼ˆå›¾ä¹¦é¦†å’Œä¹¦åŗ—ļ¼‰ +- åø®åŠ©ē”Øęˆ·åšå‡ŗęœ€ä½³é€‰ę‹© +""" diff --git a/examples/evaluation/webui/agent/test_config.json b/examples/evaluation/webui/agent/test_config.json new file mode 100644 index 0000000..49ca8d4 --- /dev/null +++ b/examples/evaluation/webui/agent/test_config.json @@ -0,0 +1,6 @@ +{ + "criteria": { + "tool_trajectory_avg_score": 0.5, + "response_match_score": 0.3 + } +} diff --git a/examples/evaluation/webui/agent/tools.py b/examples/evaluation/webui/agent/tools.py new file mode 100644 index 0000000..f393da7 --- /dev/null +++ b/examples/evaluation/webui/agent/tools.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""Tools for the book finder agent.""" + +from typing import Any, Dict + + +def search_local_library(book_title: str) -> Dict[str, Any]: + """ęŸ„čÆ¢ęœ¬åœ°å›¾ä¹¦é¦†ēš„ä¹¦ē±åÆē”Øę€§ļ¼ŒåŒ…ę‹¬å‰Æęœ¬ę•°é‡ć€ę‰€åœØåˆ†é¦†å’Œä½ē½®äæ”ęÆć€‚ + + å‚ę•°: + book_title: ä¹¦ē±ę ‡é¢˜ + + čæ”å›ž: + åŒ…å«ä¹¦ē±åÆē”Øę€§äæ”ęÆēš„å­—å…ø + """ + library_data = { + "Heartstopper: Volume 1": { + "available": False, + "copies": 0, + "branch": "中央图书馆", + "status": "ę‰€ęœ‰å‰Æęœ¬éƒ½å·²å€Ÿå‡ŗ", + }, + "三体": { + "available": True, + "copies": 3, + "branch": "ē§‘ęŠ€å›¾ä¹¦é¦†", + "status": "꜉ 3 ęœ¬åÆå€Ÿ", + 
"location": "ē§‘å¹»å°čÆ“åŒŗ A-301", + }, + "ę“»ē€": { + "available": True, + "copies": 2, + "branch": "文学图书馆", + "status": "꜉ 2 ęœ¬åÆå€Ÿ", + "location": "ēŽ°ä»£ę–‡å­¦åŒŗ B-205", + }, + "Pythonē¼–ēØ‹ļ¼šä»Žå…„é—Øåˆ°å®žč·µ": { + "available": True, + "copies": 1, + "branch": "ęŠ€ęœÆå›¾ä¹¦é¦†", + "status": "仅剩 1 ęœ¬åÆå€Ÿ", + "location": "ē¼–ēØ‹čÆ­čØ€åŒŗ C-102", + }, + } + + result = library_data.get( + book_title, + { + "available": False, + "copies": 0, + "branch": "ęœŖę‰¾åˆ°", + "status": "ęœ¬åœ°å›¾ä¹¦é¦†ę²”ęœ‰ę­¤ä¹¦", + }, + ) + + return { + "source": "ęœ¬åœ°å›¾ä¹¦é¦†", + "book_title": book_title, + **result, + } + + +def find_local_bookstore(book_title: str) -> Dict[str, Any]: + """ęŸ„ę‰¾ęœ¬åœ°ä¹¦åŗ—ēš„ä¹¦ē±åŗ“å­˜äæ”ęÆļ¼ŒåŒ…ę‹¬ä»·ę ¼ć€åœ°å€å’Œč”ē³»ę–¹å¼ć€‚ + + å‚ę•°: + book_title: ä¹¦ē±ę ‡é¢˜ + + čæ”å›ž: + åŒ…å«ä¹¦åŗ—åŗ“å­˜äæ”ęÆēš„å­—å…ø + """ + bookstore_data = { + "Heartstopper: Volume 1": { + "available": True, + "price": "49.80元", + "store": "ę–°åŽä¹¦åŗ—", + "address": "åø‚äø­åæƒę­„č”Œč”—128号", + "phone": "0755-12345678", + "stock": 5, + }, + "三体": { + "available": True, + "price": "23.00元", + "store": "å½“å½“å®žä½“ä¹¦åŗ—", + "address": "ē§‘ęŠ€å›­åŒŗåˆ›äøšč·Æ56号", + "phone": "0755-87654321", + "stock": 10, + }, + "č§£åæ§ę‚č“§åŗ—": { + "available": True, + "price": "39.50元", + "store": "方所书店", + "address": "č“­ē‰©äø­åæƒ3愼", + "phone": "0755-11223344", + "stock": 3, + }, + } + + result = bookstore_data.get( + book_title, + { + "available": False, + "store": "ęœŖę‰¾åˆ°", + "status": "ęœ¬åœ°ä¹¦åŗ—ęš‚ę— åŗ“å­˜", + }, + ) + + return { + "source": "ęœ¬åœ°ä¹¦åŗ—", + "book_title": book_title, + **result, + } + + +def order_online(book_title: str) -> Dict[str, Any]: + """ęŸ„ę‰¾åœØēŗæč“­ä¹°ä¹¦ē±ēš„é€‰é”¹ļ¼Œęä¾›å¤šäøŖē”µå•†å¹³å°ēš„äæ”ęÆå’Œé…é€ę—¶é—“ć€‚ + + å‚ę•°: + book_title: ä¹¦ē±ę ‡é¢˜ + + čæ”å›ž: + åŒ…å«åœØēŗæč“­ä¹°äæ”ęÆēš„å­—å…ø + """ + online_data = { + "default": { + "available": True, + "platforms": [ + { + "name": "äŗ¬äøœå›¾ä¹¦", + 
"price": "ę ¹ę®ä¹¦ē±č€Œå®š", + "url": "https://book.jd.com", + "delivery": "ę¬”ę—„č¾¾ļ¼ˆä¼šå‘˜ļ¼‰", + }, + { + "name": "当当网", + "price": "ę ¹ę®ä¹¦ē±č€Œå®š", + "url": "https://book.dangdang.com", + "delivery": "2-3äøŖå·„ä½œę—„", + }, + { + "name": "äŗšé©¬é€Šäø­å›½", + "price": "ę ¹ę®ä¹¦ē±č€Œå®š", + "url": "https://www.amazon.cn", + "delivery": "2-5äøŖå·„ä½œę—„", + }, + ], + "recommendation": "å»ŗč®®å…ˆęœē“¢ęÆ”ä»·ļ¼Œé€‰ę‹©ęœ€ä¼˜ęƒ ēš„å¹³å°č“­ä¹°", + } + } + + result = online_data["default"] + + return { + "source": "åœØēŗæé›¶å”®å•†", + "book_title": book_title, + **result, + } diff --git a/examples/evaluation/webui/test_book_finder.py b/examples/evaluation/webui/test_book_finder.py new file mode 100644 index 0000000..31e5380 --- /dev/null +++ b/examples/evaluation/webui/test_book_finder.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright @ 2025 Tencent.com +"""ä¹¦ē±ęŸ„ę‰¾ Agent 测试 - 仿照 ADK AgentEvaluator""" + +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + + +@pytest.mark.asyncio +async def test_webui_with_eval_set(): + """使用评估集测试 WebUI ä¹¦ē±ęŸ„ę‰¾ Agent""" + test_dir = os.path.dirname(os.path.abspath(__file__)) + eval_set_path = os.path.join(test_dir, "agent", "agent.evalset.json") + + await AgentEvaluator.evaluate( + agent_module="agent", + eval_dataset_file_path_or_dir=eval_set_path, + print_detailed_results=True, + ) From cf5e9778c7f9f304a2e3763b6dc37087b369d235 Mon Sep 17 00:00:00 2001 From: mjn96 Date: Wed, 20 May 2026 17:55:08 +0800 Subject: [PATCH 3/7] feat(tools): support GraphAgent in AgentTool Add GraphAgent support in _run_async_impl so that when the wrapped agent is a GraphAgent, the tool result is retrieved from tool_context.state using STATE_KEY_LAST_RESPONSE. 
--- trpc_agent_sdk/tools/_agent_tool.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/trpc_agent_sdk/tools/_agent_tool.py b/trpc_agent_sdk/tools/_agent_tool.py index c45f09f..78ba4ad 100644 --- a/trpc_agent_sdk/tools/_agent_tool.py +++ b/trpc_agent_sdk/tools/_agent_tool.py @@ -136,6 +136,7 @@ async def _run_async_impl( ) -> Any: try: from trpc_agent_sdk.agents import LlmAgent + from trpc_agent_sdk.dsl.graph import GraphAgent, STATE_KEY_LAST_RESPONSE if self.skip_summarization: tool_context.event_actions.skip_summarization = True if isinstance(self.agent, LlmAgent) and self.agent.input_schema: @@ -206,6 +207,8 @@ async def _run_async_impl( merged_text = '\n'.join([p.text for p in last_event.content.parts if p.text]) repaired = json_repair_string(merged_text) tool_result = self.agent.output_schema.model_validate_json(repaired).model_dump(exclude_none=True) + elif isinstance(self.agent, GraphAgent): + tool_result = tool_context.state.get(STATE_KEY_LAST_RESPONSE, '') else: tool_result = '\n'.join([p.text for p in last_event.content.parts if p.text]) return tool_result From f49692a30baed93025cd560a1a38839bfbbefde7 Mon Sep 17 00:00:00 2001 From: raychen <815315825@qq.com> Date: Tue, 26 May 2026 14:16:37 +0800 Subject: [PATCH 4/7] =?UTF-8?q?feature:=20mcp=20tool=E5=8A=A0=E4=B8=8A?= =?UTF-8?q?=E7=BC=93=E5=AD=98=E9=81=BF=E5=85=8D=E5=A4=9A=E6=AC=A1=E7=BD=91?= =?UTF-8?q?=E7=BB=9C=E8=AE=BF=E9=97=AE=20(#71)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- tests/tools/mcp_tool/test_mcp_toolset.py | 197 ++++++++++++++++++ trpc_agent_sdk/tools/mcp_tool/_mcp_toolset.py | 102 ++++++++- 3 files changed, 297 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6c47d4d..b986bed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ dependencies = [ "pydantic>=2.11.3", "openai>=1.3.0", - "mcp>=1.10.1", + "mcp<1.23.4,>=1.10.1", "aiohttp", "httpx>=0.27.0", 
"httpx-sse>=0.4.0", diff --git a/tests/tools/mcp_tool/test_mcp_toolset.py b/tests/tools/mcp_tool/test_mcp_toolset.py index 2a864ec..a825701 100644 --- a/tests/tools/mcp_tool/test_mcp_toolset.py +++ b/tests/tools/mcp_tool/test_mcp_toolset.py @@ -4,9 +4,11 @@ # # tRPC-Agent-Python is licensed under Apache-2.0. +import asyncio from unittest.mock import AsyncMock, MagicMock, patch import pytest +from mcp import types as mcp_types from mcp import StdioServerParameters as McpStdioServerParameters from mcp.types import ListToolsResult, Tool as McpBaseTool @@ -26,6 +28,13 @@ def _stdio_conn(): ) +def _server_capabilities(list_changed: bool | None = None): + tools_capability = None + if list_changed is not None: + tools_capability = mcp_types.ToolsCapability(listChanged=list_changed) + return mcp_types.ServerCapabilities(tools=tools_capability) + + # --------------------------------------------------------------------------- # Tests: __init__ # --------------------------------------------------------------------------- @@ -70,6 +79,15 @@ def test_session_group_params_custom(self): ts = MCPToolset(connection_params=_stdio_conn(), session_group_params={"key": "val"}) assert ts._session_group_params == {"key": "val"} + def test_tools_cache_enabled_by_default(self): + ts = MCPToolset(connection_params=_stdio_conn()) + assert ts._cache_tools is True + assert ts._tools_cache_ttl == 60.0 + + def test_rejects_negative_tools_cache_ttl(self): + with pytest.raises(ValueError, match="tools_cache_ttl must be non-negative"): + MCPToolset(connection_params=_stdio_conn(), tools_cache_ttl=-1) + # --------------------------------------------------------------------------- # Tests: _checker_required_params @@ -294,6 +312,185 @@ async def test_get_tools_with_custom_mcp_tool_cls(self): custom_cls.assert_called_once() assert len(tools) == 1 + @pytest.mark.asyncio + async def test_get_tools_reuses_cached_list_tools_response(self): + ts = MCPToolset(connection_params=_stdio_conn()) + + mock_mgr = 
MagicMock(spec=MCPSessionManager) + mock_session = AsyncMock() + mock_mgr.create_session = AsyncMock(return_value=mock_session) + + mcp_tools = [ + McpBaseTool(name="tool_a", description="desc_a", inputSchema={"type": "object"}), + ] + mock_session.list_tools = AsyncMock(return_value=ListToolsResult(tools=mcp_tools)) + + with patch.object(ts, "initialize"): + ts._mcp_session_manager = mock_mgr + first = await ts.get_tools() + second = await ts.get_tools() + + assert [tool.name for tool in first] == ["tool_a"] + assert [tool.name for tool in second] == ["tool_a"] + mock_session.list_tools.assert_awaited_once() + + @pytest.mark.asyncio + async def test_get_tools_can_disable_tools_cache(self): + ts = MCPToolset(connection_params=_stdio_conn(), cache_tools=False) + + mock_mgr = MagicMock(spec=MCPSessionManager) + mock_session = AsyncMock() + mock_mgr.create_session = AsyncMock(return_value=mock_session) + + mock_session.list_tools = AsyncMock( + return_value=ListToolsResult( + tools=[ + McpBaseTool(name="tool_a", description="desc_a", inputSchema={"type": "object"}), + ] + )) + + with patch.object(ts, "initialize"): + ts._mcp_session_manager = mock_mgr + await ts.get_tools() + await ts.get_tools() + + assert mock_session.list_tools.await_count == 2 + + @pytest.mark.asyncio + async def test_clear_tools_cache_forces_refresh(self): + ts = MCPToolset(connection_params=_stdio_conn()) + + mock_mgr = MagicMock(spec=MCPSessionManager) + mock_session = AsyncMock() + mock_mgr.create_session = AsyncMock(return_value=mock_session) + + mock_session.list_tools = AsyncMock( + side_effect=[ + ListToolsResult( + tools=[ + McpBaseTool(name="tool_a", description="desc_a", inputSchema={"type": "object"}), + ]), + ListToolsResult( + tools=[ + McpBaseTool(name="tool_b", description="desc_b", inputSchema={"type": "object"}), + ]), + ]) + + with patch.object(ts, "initialize"): + ts._mcp_session_manager = mock_mgr + first = await ts.get_tools() + ts.clear_tools_cache() + second = await 
ts.get_tools() + + assert [tool.name for tool in first] == ["tool_a"] + assert [tool.name for tool in second] == ["tool_b"] + assert mock_session.list_tools.await_count == 2 + + @pytest.mark.asyncio + async def test_tools_cache_ttl_expires(self): + ts = MCPToolset(connection_params=_stdio_conn(), tools_cache_ttl=1) + + mock_mgr = MagicMock(spec=MCPSessionManager) + mock_session = AsyncMock() + mock_mgr.create_session = AsyncMock(return_value=mock_session) + + mock_session.list_tools = AsyncMock( + side_effect=[ + ListToolsResult( + tools=[ + McpBaseTool(name="tool_a", description="desc_a", inputSchema={"type": "object"}), + ]), + ListToolsResult( + tools=[ + McpBaseTool(name="tool_b", description="desc_b", inputSchema={"type": "object"}), + ]), + ]) + + with patch.object(ts, "initialize"), patch( + "trpc_agent_sdk.tools.mcp_tool._mcp_toolset.time.monotonic", + side_effect=[100.0, 100.5, 101.1, 101.1, 101.1], + ): + ts._mcp_session_manager = mock_mgr + first = await ts.get_tools() + cached = await ts.get_tools() + refreshed = await ts.get_tools() + + assert [tool.name for tool in first] == ["tool_a"] + assert [tool.name for tool in cached] == ["tool_a"] + assert [tool.name for tool in refreshed] == ["tool_b"] + assert mock_session.list_tools.await_count == 2 + + @pytest.mark.asyncio + async def test_list_changed_capability_uses_notification_driven_cache(self): + ts = MCPToolset(connection_params=_stdio_conn(), tools_cache_ttl=1) + + mock_mgr = MagicMock(spec=MCPSessionManager) + mock_session = AsyncMock() + mock_session.get_server_capabilities = MagicMock(return_value=_server_capabilities(list_changed=True)) + mock_mgr.create_session = AsyncMock(return_value=mock_session) + + mock_session.list_tools = AsyncMock( + return_value=ListToolsResult( + tools=[ + McpBaseTool(name="tool_a", description="desc_a", inputSchema={"type": "object"}), + ] + )) + + with patch.object(ts, "initialize"), patch( + "trpc_agent_sdk.tools.mcp_tool._mcp_toolset.time.monotonic", + 
return_value=100.0, + ): + ts._mcp_session_manager = mock_mgr + first = await ts.get_tools() + second = await ts.get_tools() + + assert [tool.name for tool in first] == ["tool_a"] + assert [tool.name for tool in second] == ["tool_a"] + mock_session.list_tools.assert_awaited_once() + + @pytest.mark.asyncio + async def test_tool_list_changed_notification_clears_cache_and_chains_handler(self): + user_message_handler = AsyncMock() + ts = MCPToolset( + connection_params=_stdio_conn(), + session_group_params={"message_handler": user_message_handler}, + ) + ts._tools_cache = ListToolsResult( + tools=[ + McpBaseTool(name="tool_a", description="desc_a", inputSchema={"type": "object"}), + ]) + ts._tools_cache_updated_at = 100.0 + + params = ts._build_session_group_params() + notification = mcp_types.ServerNotification(mcp_types.ToolListChangedNotification()) + await params["message_handler"](notification) + + assert ts._tools_cache is None + assert ts._tools_cache_updated_at is None + user_message_handler.assert_awaited_once_with(notification) + + @pytest.mark.asyncio + async def test_concurrent_get_tools_shares_cache_fill(self): + ts = MCPToolset(connection_params=_stdio_conn()) + + mock_mgr = MagicMock(spec=MCPSessionManager) + mock_session = AsyncMock() + mock_mgr.create_session = AsyncMock(return_value=mock_session) + mock_session.list_tools = AsyncMock( + return_value=ListToolsResult( + tools=[ + McpBaseTool(name="tool_a", description="desc_a", inputSchema={"type": "object"}), + ] + )) + + with patch.object(ts, "initialize"): + ts._mcp_session_manager = mock_mgr + first, second = await asyncio.gather(ts.get_tools(), ts.get_tools()) + + assert [tool.name for tool in first] == ["tool_a"] + assert [tool.name for tool in second] == ["tool_a"] + mock_session.list_tools.assert_awaited_once() + # --------------------------------------------------------------------------- # Tests: close diff --git a/trpc_agent_sdk/tools/mcp_tool/_mcp_toolset.py 
b/trpc_agent_sdk/tools/mcp_tool/_mcp_toolset.py index 38afd0a..36e08e0 100644 --- a/trpc_agent_sdk/tools/mcp_tool/_mcp_toolset.py +++ b/trpc_agent_sdk/tools/mcp_tool/_mcp_toolset.py @@ -25,11 +25,17 @@ from __future__ import annotations +import asyncio +import inspect +import time +from typing import cast from typing import List from typing import Optional from typing import Union from typing_extensions import override +from mcp import ClientSession +from mcp import types as mcp_types from mcp.types import ListToolsResult from trpc_agent_sdk.abc import ToolPredicate @@ -81,7 +87,9 @@ def __init__(self, mcp_tool_cls=MCPTool, filters_name: Optional[list[str]] = None, filters: Optional[list[BaseFilter]] = None, - session_group_params: Optional[dict] = None): + session_group_params: Optional[dict] = None, + cache_tools: bool = True, + tools_cache_ttl: Optional[float] = 60.0): """Initializes the MCPToolset. Args: @@ -103,10 +111,17 @@ def __init__(self, filters_name: List of filter names to apply to the tools filters: List of filter instances to apply to the tools session_group_params: Optional parameters for session group management + cache_tools: Whether to cache the MCP server's list_tools response. + tools_cache_ttl: Cache lifetime in seconds for MCP servers that do not + support tools.listChanged notifications. Servers that support + listChanged use notification-driven invalidation instead. 
""" super().__init__(tool_filter=tool_filter, is_include_all_tools=is_include_all_tools) + if tools_cache_ttl is not None and tools_cache_ttl < 0: + raise ValueError("tools_cache_ttl must be non-negative.") + self._connection_params = connection_params self._mcp_tool_cls = mcp_tool_cls # Create the session manager that will handle the MCP connection @@ -114,6 +129,11 @@ def __init__(self, self._filters = filters self._filters_name = filters_name self._session_group_params = session_group_params or {} + self._cache_tools = cache_tools + self._tools_cache_ttl = tools_cache_ttl + self._tools_cache_lock = asyncio.Lock() + self._tools_cache: ListToolsResult | None = None + self._tools_cache_updated_at: float | None = None def _checker_required_params(self): """Validates that all required parameters are properly initialized. @@ -126,6 +146,81 @@ def _checker_required_params(self): if not self._mcp_session_manager: raise ValueError("_mcp_session_manager is None.") + def clear_tools_cache(self) -> None: + """Clears the cached MCP tool definitions. + + Call this when the MCP server's tool set is known to have changed and + the next get_tools call should re-query list_tools. 
+ """ + self._tools_cache = None + self._tools_cache_updated_at = None + + def _server_supports_tool_list_changed(self, session: ClientSession) -> bool: + """Returns whether the server can notify client about tool list changes.""" + try: + get_capabilities = getattr(session, "get_server_capabilities", None) + if get_capabilities is None: + return False + capabilities = get_capabilities() + if inspect.isawaitable(capabilities): + close = getattr(capabilities, "close", None) + if close is not None: + close() + return False + except Exception: # pylint: disable=broad-except + return False + + tools_capability = getattr(capabilities, "tools", None) + return getattr(tools_capability, "listChanged", False) is True + + def _is_tools_cache_valid(self, session: ClientSession) -> bool: + """Returns whether the cached list_tools response can be reused.""" + if not self._cache_tools or self._tools_cache is None: + return False + if self._server_supports_tool_list_changed(session): + return True + if self._tools_cache_ttl is None: + return False + if self._tools_cache_updated_at is None: + return False + return time.monotonic() - self._tools_cache_updated_at < self._tools_cache_ttl + + async def _get_tools_response(self, session: ClientSession) -> ListToolsResult: + """Returns MCP tool definitions, using cache when enabled.""" + if not self._cache_tools: + return await session.list_tools() + + if self._is_tools_cache_valid(session): + return cast(ListToolsResult, self._tools_cache) + + async with self._tools_cache_lock: + if self._is_tools_cache_valid(session): + return cast(ListToolsResult, self._tools_cache) + + tools_response: ListToolsResult = await session.list_tools() + self._tools_cache = tools_response + self._tools_cache_updated_at = time.monotonic() + return tools_response + + def _build_session_group_params(self) -> dict: + """Builds ClientSession params with tool-change notification handling.""" + params = dict(self._session_group_params) + if not self._cache_tools: 
+ return params + + user_message_handler = params.get("message_handler") + + async def message_handler(message): + if (isinstance(message, mcp_types.ServerNotification) + and isinstance(message.root, mcp_types.ToolListChangedNotification)): + self.clear_tools_cache() + + if user_message_handler is not None: + await user_message_handler(message) + + params["message_handler"] = message_handler + return params + @override def initialize(self) -> None: """Initialize the toolset.""" @@ -135,7 +230,7 @@ def initialize(self) -> None: self._connection_params = convert_conn_params(self._connection_params) self._mcp_session_manager = MCPSessionManager( connection_params=self._connection_params, - session_group_params=self._session_group_params, + session_group_params=self._build_session_group_params(), ) self._checker_required_params() @@ -159,7 +254,7 @@ async def get_tools( session = await self._mcp_session_manager.create_session() # Fetch available tools from the MCP server - tools_response: ListToolsResult = await session.list_tools() + tools_response = await self._get_tools_response(session) # Apply filtering based on context and tool_filter tools = [] @@ -184,6 +279,7 @@ async def close(self) -> None: gracefully to avoid blocking application shutdown. 
""" try: + self.clear_tools_cache() if self._mcp_session_manager is None: return await self._mcp_session_manager.close() From 3705c20637fa552d95dd558a84c48d6b34bb566c Mon Sep 17 00:00:00 2001 From: weimch Date: Mon, 1 Jun 2026 19:09:59 +0800 Subject: [PATCH 5/7] =?UTF-8?q?Bugfix:=20=E4=BF=AE=E5=A4=8DSqlSessionServi?= =?UTF-8?q?ce=E5=9C=A8sqlite=E4=B8=8B=E5=9B=A0=E6=97=B6=E5=8C=BA=E8=AE=BE?= =?UTF-8?q?=E7=BD=AE=E4=B8=8D=E5=AF=B9=E5=AF=BC=E8=87=B4=E9=A2=91=E7=B9=81?= =?UTF-8?q?warn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - é—®é¢˜ļ¼šå½“å‰åœØSqlSessionServiceēš„å®žēŽ°äø­ļ¼Œé»˜č®¤åˆ›å»ŗDBēš„č”Øäø­ļ¼Œupdate_time使用了sqlalchemyēš„func.nowļ¼Œä½†åœØę›“ę–°ę—¶é—“ę—¶ļ¼Œä½æē”Øäŗ†datetime.now,在sqliteå®žēŽ°é‡Œļ¼Œfunc.nowé»˜č®¤ä½æē”Øäŗ†utcēš„ę—¶é—“ļ¼Œč€Œdatetime.nowäøę˜Æutcēš„ę—¶é—“ļ¼ŒåÆ¼č‡“append_eventę—¶ļ¼Œå› ę—¶åŒŗäøåŒļ¼ŒåÆ¼č‡“diff失蓄,warnå‘Šč­¦ - č§£å†³ę–¹ę”ˆļ¼šę€»ę˜Æä½æē”Øfunc.nowę„ę›“ę–°ę—¶é—“ --- .../sessions/_sql_session_service.py | 52 ++++++++++++++----- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/trpc_agent_sdk/sessions/_sql_session_service.py b/trpc_agent_sdk/sessions/_sql_session_service.py index dc07e2b..82dd41a 100644 --- a/trpc_agent_sdk/sessions/_sql_session_service.py +++ b/trpc_agent_sdk/sessions/_sql_session_service.py @@ -103,6 +103,25 @@ class SessionStorageBase(DeclarativeBase): pass +def _storage_dialect_name(storage: SessionStorageBase) -> Optional[str]: + orm_session = inspect(storage).session + if orm_session is None or orm_session.bind is None: + return None + return orm_session.bind.dialect.name + + +def _timestamp_tz(value: datetime, dialect_name: Optional[str]) -> float: + if dialect_name == "sqlite": + return value.replace(tzinfo=timezone.utc).timestamp() + return value.timestamp() + + +def _expire_before(sql_session: SqlSession, ttl_seconds: int) -> datetime: + if sql_session.bind is not None and sql_session.bind.dialect.name == "sqlite": + return 
datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(seconds=ttl_seconds) + return datetime.now() - timedelta(seconds=ttl_seconds) + + class StorageSession(SessionStorageBase): """Represents a session stored in the database with TTL support. @@ -135,14 +154,11 @@ def __repr__(self): @property def _dialect_name(self) -> Optional[str]: - session = inspect(self).session - return session.bind.dialect.name if session else None # type: ignore + return _storage_dialect_name(self) @property def update_timestamp_tz(self) -> float: - if self._dialect_name == "sqlite": - return self.update_time.replace(tzinfo=timezone.utc).timestamp() - return self.update_time.timestamp() + return _timestamp_tz(self.update_time, self._dialect_name) def to_session( self, @@ -306,6 +322,10 @@ class StorageAppState(SessionStorageBase): state: Mapped[MutableDict[str, Any]] = mapped_column(MutableDict.as_mutable(DynamicJSON), default={}) update_time: Mapped[datetime] = mapped_column(PreciseTimestamp, default=func.now(), onupdate=func.now()) + @property + def update_timestamp_tz(self) -> float: + return _timestamp_tz(self.update_time, _storage_dialect_name(self)) + class StorageUserState(SessionStorageBase): """Represents a user state stored in the database with TTL support. @@ -319,6 +339,10 @@ class StorageUserState(SessionStorageBase): state: Mapped[MutableDict[str, Any]] = mapped_column(MutableDict.as_mutable(DynamicJSON), default={}) update_time: Mapped[datetime] = mapped_column(PreciseTimestamp, default=func.now(), onupdate=func.now()) + @property + def update_timestamp_tz(self) -> float: + return _timestamp_tz(self.update_time, _storage_dialect_name(self)) + class SqlSessionService(BaseSessionService): """A SQL database implementation of the session service. 
@@ -452,7 +476,7 @@ async def list_sessions(self, *, app_name: str, user_id: str) -> ListSessionsRes sessions = [] for storage_session in results: - if self._session_config.is_expired_by_timestamp(storage_session.update_time.timestamp()): + if self._session_config.is_expired_by_timestamp(storage_session.update_timestamp_tz): logger.debug("Cleaned up expired session: %s/%s/%s", storage_session.app_name, storage_session.user_id, storage_session.id) continue @@ -593,7 +617,7 @@ async def _update_app_state(self, sql_session: SqlSession, app_name: str, state_ await self._sql_storage.add(sql_session, storage_app_state) else: storage_app_state.state = app_state # type: ignore - storage_app_state.update_time = datetime.now() + storage_app_state.update_time = func.now() return app_state @@ -621,9 +645,9 @@ async def _get_app_state(self, sql_session: SqlSession, app_name: str) -> dict[s app_state = {} if storage_app_state: - if not self._session_config.is_expired_by_timestamp(storage_app_state.update_time.timestamp()): + if not self._session_config.is_expired_by_timestamp(storage_app_state.update_timestamp_tz): app_state = storage_app_state.state - storage_app_state.update_time = datetime.now() + storage_app_state.update_time = func.now() await self._sql_storage.commit(sql_session) return app_state @@ -634,9 +658,9 @@ async def _get_user_state(self, sql_session: SqlSession, app_name: str, user_id: user_state = {} if storage_user_state: - if not self._session_config.is_expired_by_timestamp(storage_user_state.update_time.timestamp()): + if not self._session_config.is_expired_by_timestamp(storage_user_state.update_timestamp_tz): user_state = storage_user_state.state - storage_user_state.update_time = datetime.now() + storage_user_state.update_time = func.now() await self._sql_storage.commit(sql_session) return user_state @@ -648,11 +672,11 @@ async def _get_session(self, sql_session: SqlSession, app_name: str, user_id: st if storage_session is None: return None - if 
self._session_config.is_expired_by_timestamp(storage_session.update_time.timestamp()): + if self._session_config.is_expired_by_timestamp(storage_session.update_timestamp_tz): logger.debug("Session %s is expired", session_id) return None - storage_session.update_time = datetime.now() + storage_session.update_time = func.now() await self._sql_storage.commit(sql_session) return storage_session @@ -665,7 +689,7 @@ async def _cleanup_expired_async(self) -> None: """ async with self._sql_storage.create_db_session() as sql_session: # Calculate expiration threshold once in application time for cross-database compatibility. - expire_before = datetime.now() - timedelta(seconds=self._session_config.ttl.ttl_seconds) + expire_before = _expire_before(sql_session, self._session_config.ttl.ttl_seconds) total_deleted = 0 # Batch delete expired sessions From 02120a7e2f65bc6c1512fbbbe7a2ce48baa74cca Mon Sep 17 00:00:00 2001 From: raychen <815315825@qq.com> Date: Tue, 26 May 2026 20:29:18 +0800 Subject: [PATCH 6/7] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=20skills=20?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=E5=8F=AF=E8=87=AA=E5=8A=A8=E6=81=A2=E5=A4=8D?= =?UTF-8?q?=E7=9A=84=20Cube=20sandbox=20runtime?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ę–°å¢ž CubeClientConfig 和 create_cube_sandbox_clientļ¼Œē»Ÿäø€ Cube sandbox client åˆ›å»ŗå…„å£ć€‚ - 在 CubeSandboxClient äø­ę”ÆęŒåÆé€‰ auto_recover,sandbox čæ‡ęœŸęˆ–äøå­˜åœØę—¶č‡ŖåŠØåˆ›å»ŗę–° sandboxļ¼Œå¹¶é‡čÆ•å½“å‰å¤±č“„ę“ä½œäø€ę¬”ć€‚ - 将 Cube čæ”å›žēš„ ā€œrequested resource does not existā€ čÆ†åˆ«äøŗ sandbox å¤±ę•ˆäæ”å·ļ¼Œē”ØäŗŽč§¦å‘č‡ŖåŠØę¢å¤ć€‚ - ę”ÆęŒé€ščæ‡ CubeSandboxClient ē›“ęŽ„åˆ›å»ŗ CubeWorkspaceRuntime,并暓露 sandbox ē”Ÿå‘½å‘ØęœŸč¾…åŠ©ę–¹ę³•ć€‚ - ę–°å¢žå…±äŗ« workspace runtime resolver helperļ¼Œå¹¶é€ščæ‡ repository.get_workspace_runtime(ctx) ē»Ÿäø€čŽ·å– skill repository ēš„ runtime怂 - ꛓꖰ skill load/run/exec/stager č·Æå¾„ļ¼Œē»Ÿäø€ä½æē”Ø repository ēŗ§ workspace runtimeļ¼Œē”®äæåŒäø€ 
repository äø‹ēš„å·„å…·å…±äŗ«åŒäø€äøŖ workspace runtime äøŠäø‹ę–‡ć€‚ --- examples/skills_with_cube/.env | 9 + examples/skills_with_cube/README.md | 99 ++++++++ examples/skills_with_cube/agent/__init__.py | 5 + examples/skills_with_cube/agent/agent.py | 41 +++ examples/skills_with_cube/agent/config.py | 19 ++ examples/skills_with_cube/agent/prompts.py | 36 +++ examples/skills_with_cube/agent/tools.py | 116 +++++++++ examples/skills_with_cube/run_agent.py | 146 +++++++++++ .../skills/python-math/SKILL.md | 31 +++ .../skills/python-math/scripts/fib.py | 24 ++ tests/code_executors/cube/test_bug_hunt.py | 7 +- .../code_executors/cube/test_code_executor.py | 12 +- tests/code_executors/cube/test_runtime.py | 43 ++++ tests/code_executors/cube/test_sandbox.py | 21 +- tests/skills/stager/test_base_stager.py | 18 ++ tests/skills/tools/test_copy_stager.py | 1 + tests/skills/tools/test_skill_load.py | 31 +++ tests/skills/tools/test_skill_run.py | 13 + trpc_agent_sdk/code_executors/__init__.py | 2 + .../code_executors/_base_workspace_runtime.py | 20 ++ .../code_executors/cube/__init__.py | 4 + .../code_executors/cube/_code_executor.py | 13 +- .../code_executors/cube/_runtime.py | 44 +++- .../code_executors/cube/_sandbox.py | 234 ++++++++++++------ trpc_agent_sdk/code_executors/cube/_types.py | 24 +- trpc_agent_sdk/skills/_repository.py | 14 +- trpc_agent_sdk/skills/_toolset.py | 1 + trpc_agent_sdk/skills/stager/_base_stager.py | 2 +- trpc_agent_sdk/skills/tools/_skill_exec.py | 2 +- trpc_agent_sdk/skills/tools/_skill_load.py | 2 +- trpc_agent_sdk/skills/tools/_skill_run.py | 19 +- .../skills/tools/_workspace_exec.py | 5 +- 32 files changed, 927 insertions(+), 131 deletions(-) create mode 100644 examples/skills_with_cube/.env create mode 100644 examples/skills_with_cube/README.md create mode 100644 examples/skills_with_cube/agent/__init__.py create mode 100644 examples/skills_with_cube/agent/agent.py create mode 100644 examples/skills_with_cube/agent/config.py create mode 100644 
examples/skills_with_cube/agent/prompts.py create mode 100644 examples/skills_with_cube/agent/tools.py create mode 100644 examples/skills_with_cube/run_agent.py create mode 100644 examples/skills_with_cube/skills/python-math/SKILL.md create mode 100644 examples/skills_with_cube/skills/python-math/scripts/fib.py diff --git a/examples/skills_with_cube/.env b/examples/skills_with_cube/.env new file mode 100644 index 0000000..e24d24f --- /dev/null +++ b/examples/skills_with_cube/.env @@ -0,0 +1,9 @@ +# Set TRPC_AGENT_API_KEY态TRPC_AGENT_BASE_URL态TRPC_AGENT_MODEL_NAME +TRPC_AGENT_API_KEY=your-api-key +TRPC_AGENT_BASE_URL=your-base-url +TRPC_AGENT_MODEL_NAME=your-model-name + + +CUBE_TEMPLATE_ID=your-cube-template-id +E2B_API_URL=your-e2b-api-url +E2B_API_KEY=your-e2b-api-key diff --git a/examples/skills_with_cube/README.md b/examples/skills_with_cube/README.md new file mode 100644 index 0000000..6870353 --- /dev/null +++ b/examples/skills_with_cube/README.md @@ -0,0 +1,99 @@ +# Skills Cube äøŽ stage_inputs 示例 + +ęœ¬ē¤ŗä¾‹ę¼”ē¤ŗåœØč…¾č®Æäŗ‘ Agent 沙箱 Cube å·„ä½œåŒŗäø­ę‰§č”Œ `skill_run`ļ¼Œå¹¶é€ščæ‡ `host://`态`workspace://`态`skill://` ē­‰č¾“å…„ę–¹ę”ˆę¼”ē¤ŗ `stage_inputs` å¦‚ä½•ęŠŠęœ¬åœ°č¾“å…„äøŠä¼ /å¤åˆ¶åˆ°čæœē«Æę²™ē®±ć€‚ + +## 关键特性 + +- `create_cube_sandbox_client(CubeClientConfig(auto_recover=True))`ļ¼šåˆ›å»ŗčæœē«Æ Cube 沙箱;底层 sandbox čæ‡ęœŸ/äøå­˜åœØę—¶ļ¼Œ`CubeSandboxClient` ä¼šč‡ŖåŠØåˆ›å»ŗę–° sandbox å¹¶é‡čÆ•å½“å‰ę“ä½œäø€ę¬” +- `agent/tools.py` äø­ `build_cube_skill_run_payload` ē”Ÿęˆå›ŗå®šå½¢ę€ēš„ `skill_run` č“Ÿč½½ä¾›ęØ”åž‹č°ƒē”Ø +- `host://` č¾“å…„ä¼šä»Žčæč”Œē¤ŗä¾‹ēš„ęœ¬ęœŗč·Æå¾„äøŠä¼ åˆ° Cube ę²™ē®±ļ¼Œäøä¾čµ– Docker bind mount +- `run_agent.py` ä¼šå‡†å¤‡ē¤ŗä¾‹ `/tmp/skillrun-inputs/sales.csv`,demo ē»“ęŸåŽé”€ęÆ Cube 沙箱 + +## Agent å±‚ēŗ§ē»“ęž„čÆ“ę˜Ž + +- ę ¹čŠ‚ē‚¹ļ¼š`LlmAgent`ļ¼ŒęŒ‚č½½ `SkillToolSet`(Cube čæč”Œę—¶ + ęŠ€čƒ½ä»“åŗ“ļ¼‰ +- 无子 Agent + +## å…³é”®ä»£ē č§£é‡Š + +- `agent/tools.py`ļ¼šé€ščæ‡ `CubeClientConfig(auto_recover=True)` åˆ›å»ŗ 
`CubeSandboxClient`ļ¼Œå†é€ščæ‡ `create_cube_workspace_runtime` åˆ›å»ŗ workspace runtime +- `agent/agent.py`ļ¼šå¼‚ę­„åˆ›å»ŗ agent,并把 workspace runtime čæ”å›žē»™ runner åšęœ€ē»ˆé”€ęÆ +- `run_agent.py`ļ¼šē»„č£…å« `inputs` ę•°ē»„ēš„ JSON ęē¤ŗčÆļ¼Œé©±åŠØå•ę¬” `skill_run` ę¼”ē¤ŗļ¼Œå¹¶åœØ finally 中销毁沙箱 + +## ēŽÆå¢ƒäøŽčæč”Œ + +- Python 3.12;仓库根目录安装 Cube extra:`pip install -e '.[cube]'` +- é…ē½® `TRPC_AGENT_API_KEY`态`TRPC_AGENT_BASE_URL`态`TRPC_AGENT_MODEL_NAME` +- é…ē½® Cube ēŽÆå¢ƒå˜é‡ļ¼š`CUBE_TEMPLATE_ID`态`E2B_API_URL`态`E2B_API_KEY` +- åÆé€‰ļ¼š`SKILLS_ROOT`态`CUBE_EXECUTE_TIMEOUT`(默认 `30`)、`CUBE_IDLE_TIMEOUT`(默认 `600`) + +```bash +cd examples/skills_with_cube +python3 run_agent.py +``` + +### 验证 sandbox é‡å»ŗåŽēš„ Skill runtime ę¢å¤ + +äøŗäŗ†éŖŒčÆäøšåŠ”äø»åŠØé‡å»ŗ sandbox åŽļ¼ŒSkill å·„å…·é“¾ä»čƒ½ē»§ē»­ä½æē”Øå½“å‰ workspace runtimeļ¼ŒåÆä»„å¼€åÆäø‹é¢ēš„ēŽÆå¢ƒå˜é‡ļ¼š + +```bash +SKILLS_WITH_CUBE_RECREATE_BETWEEN_RUNS=1 python3 run_agent.py +``` + +čÆ„ęØ”å¼ä¼ščæžē»­å‘čµ·äø¤ę¬”ē›øåŒēš„ `skill_run` čÆ·ę±‚ļ¼šē¬¬äø€ę¬”ę­£åøøčæč”Œļ¼›ē¬¬äŗŒę¬”čÆ·ę±‚å‰é€ščæ‡ workspace runtime äø»åŠØé‡å»ŗ Cube sandboxć€‚åŖč¦ē¬¬äŗŒę¬”čÆ·ę±‚ä¹Ÿčƒ½ę­£åøøå®Œęˆļ¼Œå°±čÆ“ę˜Ž Skill å·„å…·é“¾åÆä»„ē»§ē»­ä½æē”Øę–°ēš„ runtimeļ¼Œč€Œäøę˜Æē»§ē»­č®æé—®čæ‡ęœŸ sandbox怂 + +### 验证 sandbox å¤±ę•ˆåŽēš„č‡ŖåŠØę¢å¤ + +äøŗäŗ†éŖŒčÆę›“ęŽ„čæ‘ēœŸå®žåœŗę™Æēš„č‡ŖåŠØę¢å¤č·Æå¾„ļ¼ŒåÆä»„å¼€åÆäø‹é¢ēš„äø“ę—¶ęµ‹čÆ•å¼€å…³ļ¼š + +```bash +SKILLS_WITH_CUBE_KILL_BETWEEN_RUNS=1 python3 run_agent.py +``` + +čÆ„ęØ”å¼åŒę ·ä¼ščæžē»­å‘čµ·äø¤ę¬”ē›øåŒēš„ `skill_run` čÆ·ę±‚ć€‚ē¬¬äø€ę¬”čÆ·ę±‚ęˆåŠŸåŽļ¼Œē¤ŗä¾‹ä¼šē›“ęŽ„ kill 远端 Cube sandboxļ¼Œä½†äæē•™ęœ¬åœ° `CubeSandboxClient` äø­ēš„ę—§å„ęŸ„ć€‚ē¬¬äŗŒę¬”čÆ·ę±‚ē»§ē»­ä½æē”Øę—§å„ęŸ„č®æé—®čæœē«Æ sandboxļ¼Œę­¤ę—¶ Cube ä¼ščæ”å›žē±»ä¼¼ `Code.unknown: The requested resource does not exist` ēš„é”™čÆÆļ¼›å¦‚ęžœ `auto_recover=True` ē”Ÿę•ˆļ¼Œę—„åæ—äø­ä¼šå‡ŗēŽ°ļ¼š + +```txt +Cube sandbox expired; recreating sandbox client: Code.unknown: The requested resource does 
not exist +Cube sandbox client using sandbox: +``` + +éšåŽē¬¬äŗŒę¬” `skill_run` ä»åŗ”čæ”å›ž `exit_code=0`ļ¼ŒčÆ“ę˜Žę—§ sandbox č¢«å¹³å°/å¤–éƒØé”€ęÆåŽļ¼Œ`CubeSandboxClient` å·²č‡ŖåŠØę¢å¤å¹¶ē»§ē»­ę‰§č”Œå½“å‰ę“ä½œć€‚ + +ę³Øę„ļ¼š`SKILLS_WITH_CUBE_KILL_BETWEEN_RUNS` ę˜Æäøŗäŗ†éŖŒčÆę¢å¤ęœŗåˆ¶č€ŒåŠ å…„ēš„äø“ę—¶ä»£ē ļ¼Œä¼šä½æē”Øē§ęœ‰å„ęŸ„ē›“ęŽ„ kill 远端 sandboxć€‚ę­£å¼ęäŗ¤ē¤ŗä¾‹ęˆ–ē”Ÿäŗ§ä»£ē ę—¶åÆä»„åˆ é™¤čÆ„ęµ‹čÆ•å¼€å…³åŠåÆ¹åŗ” helper怂 + +## ęœŸęœ›čæč”Œē»“ęžœ + +```txt +[START] skills_with_cube +... +created Cube sandbox ... +... +šŸ”§ [Invoke Tool:: skill_run({... 'inputs': [ + 'host:///tmp/skillrun-inputs/sales.csv', + 'workspace://skills/python-math/SKILL.md', + 'skill://python-math/scripts/fib.py', +], ...}) +šŸ“Š [Tool Result: { + 'stdout': '', 'stderr': '', 'exit_code': 0, + 'output_files': [ + {'name': 'out/fib.txt', 'content': '0\n1\n1\n2\n3\n5\n8\n13\n21\n34\n', ...}, + {'name': 'out/staged_inputs_tree.txt', 'content': + 'work/inputs:\nsales.csv\n---\nwork/staged_inputs:\nfib.py\npython-math_skill.md\n', ...}, + ], + ... +}] +... 
+``` + +## ē»“ęžœåˆ†ęžļ¼ˆę˜Æå¦ē¬¦åˆč¦ę±‚ļ¼‰ + +ē¬¦åˆęœ¬ē¤ŗä¾‹ęµ‹čÆ•č¦ę±‚ļ¼šCube ę²™ē®±ęˆåŠŸåˆ›å»ŗå¹¶å®Œęˆ `skill_run` č°ƒē”Øé“¾ļ¼›`host://` / `workspace://` / `skill://` äø‰ē§ input scheme éƒ½ęˆåŠŸč½å…„čæœē«Æå·„ä½œåŒŗļ¼Œč¾“å‡ŗę–‡ä»¶ `out/fib.txt` 和 `out/staged_inputs_tree.txt` ę­£åøøäŗ§å‡ŗļ¼Œčæ›ēØ‹ä»„ `exit_code=0` ē»“ęŸć€‚ + +å¦‚ęžœä½æē”Ø `SKILLS_WITH_CUBE_KILL_BETWEEN_RUNS=1`ļ¼Œčæ˜éœ€č¦ē”®č®¤ē¬¬äŗŒę¬”čÆ·ę±‚äø­å‡ŗēŽ°č‡ŖåŠØę¢å¤ę—„åæ—ļ¼Œå¹¶äø”ē¬¬äŗŒę¬” `skill_run` ä»ē„¶ęˆåŠŸć€‚čæ™č”Øē¤ŗēœŸå®žēš„ā€œę—§ sandbox äøå­˜åœØ -> client č‡ŖåŠØé‡å»ŗ -> å½“å‰ę“ä½œé‡čÆ•ā€é“¾č·Æé€ščæ‡ć€‚ + +## é€‚ē”Øåœŗę™Æå»ŗč®® + +- éœ€č¦åœØčæœē«Æ Cube ę²™ē®±å†…ę‰§č”ŒęŠ€čƒ½ć€å¹¶éŖŒčÆęœ¬åœ°č¾“å…„äøŠä¼ åˆ°ę²™ē®±ę—¶å‚č€ƒęœ¬ē¤ŗä¾‹ +- č°ƒčÆ• `workspace://` ę—¶åŗ”ē”®äæęŗę–‡ä»¶å·²å­˜åœØäŗŽå½“å‰ workspaceļ¼Œå†å¤åˆ¶ęˆ–é“¾ęŽ„åˆ°ē›®ę ‡č·Æå¾„ +- é•æē”Ÿå‘½å‘ØęœŸ agent 建议开启 `CubeClientConfig(auto_recover=True)`ļ¼Œéæå… Cube sandbox å› č¶…ę—¶ęˆ–å¹³å°ęø…ē†åŽåÆ¼č‡“åŽē»­ęŠ€čƒ½č°ƒē”ØęŒē»­å¤±č“„ +- č‡ŖåŠØę¢å¤ä¼šåˆ›å»ŗå…Øę–°ēš„ sandbox,远端 workspace å†…å®¹äøä¼šč‡ŖåŠØä»Žę—§ sandbox 迁移;Skill staging 和 workspace åˆ›å»ŗé€»č¾‘éœ€č¦čƒ½åœØę–° sandbox äøŠé‡ę–°ę‰§č”Œ diff --git a/examples/skills_with_cube/agent/__init__.py b/examples/skills_with_cube/agent/__init__.py new file mode 100644 index 0000000..bc6e483 --- /dev/null +++ b/examples/skills_with_cube/agent/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. diff --git a/examples/skills_with_cube/agent/agent.py b/examples/skills_with_cube/agent/agent.py new file mode 100644 index 0000000..2adb7f8 --- /dev/null +++ b/examples/skills_with_cube/agent/agent.py @@ -0,0 +1,41 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. 
+# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Agent for Cube-backed skill runs.""" + +from trpc_agent_sdk.agents import LlmAgent +from trpc_agent_sdk.models import LLMModel +from trpc_agent_sdk.models import OpenAIModel + +from .config import get_model_config +from .prompts import INSTRUCTION +from .tools import create_skill_tool_set + + +def _create_model() -> LLMModel: + """ Create a model""" + api_key, url, model_name = get_model_config() + model = OpenAIModel(model_name=model_name, api_key=api_key, base_url=url) + return model + + +async def create_agent(): + """Create a Cube-backed skill run agent and its workspace runtime.""" + + # Create tools + skill_tool_set, skill_repository, workspace_runtime = await create_skill_tool_set() + agent = LlmAgent( + name="skill_run_agent_with_cube", + description="A professional skill run assistant that can use Agent Skills.", + model=_create_model(), + # Use state variables for template replacement - Demonstration of the {var} syntax + instruction=INSTRUCTION, + tools=[skill_tool_set], + skill_repository=skill_repository, + ) + return agent, workspace_runtime + + +root_agent = None diff --git a/examples/skills_with_cube/agent/config.py b/examples/skills_with_cube/agent/config.py new file mode 100644 index 0000000..db0d491 --- /dev/null +++ b/examples/skills_with_cube/agent/config.py @@ -0,0 +1,19 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+""" Agent config module""" + +import os + + +def get_model_config() -> tuple[str, str, str]: + """Get model config from environment variables""" + api_key = os.getenv('TRPC_AGENT_API_KEY', '') + url = os.getenv('TRPC_AGENT_BASE_URL', '') + model_name = os.getenv('TRPC_AGENT_MODEL_NAME', '') + if not api_key or not url or not model_name: + raise ValueError('''TRPC_AGENT_API_KEY, TRPC_AGENT_BASE_URL, + and TRPC_AGENT_MODEL_NAME must be set in environment variables''') + return api_key, url, model_name diff --git a/examples/skills_with_cube/agent/prompts.py b/examples/skills_with_cube/agent/prompts.py new file mode 100644 index 0000000..690c7fd --- /dev/null +++ b/examples/skills_with_cube/agent/prompts.py @@ -0,0 +1,36 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +""" prompts for agent""" + +INSTRUCTION = """ +Be a concise, helpful assistant that can use Agent Skills. + +When a task may need tools, first ask to list skills or suggest one. +Load a skill only when needed, then run commands from its docs exactly. +Prefer safe defaults; ask clarifying questions if anything is ambiguous. +When running, include output_files patterns if files are expected. +Summarize results, note saved files, and propose next steps briefly. + +Inside a Cube skill workspace, inputs staged from host:// are uploaded +copies in the remote sandbox. Treat inputs/ and work/inputs/ as input +data and write new results under out/ or $OUTPUT_DIR. + +When chaining multiple skills, read previous results directly from +out/ (or $OUTPUT_DIR) and write new files back to out/. Prefer using +skill_run inputs/outputs fields to map files instead of shell commands +like cp or mv where possible. + +When using a skill, follow this workflow: +1. First call skill_load to load the skill documentation +2. 
Always call skill_list_docs immediately after skill_load to verify what documents have been loaded, + including documents from subdirectories (e.g., references/ folder) +3. If needed, use skill_select_docs to add additional documents +4. Call skill_list_docs again after skill_select_docs to confirm the final set of loaded documents +5. Finally use skill_run to execute commands + +This ensures you can verify that all relevant documentation files, including those in subdirectories, +are properly loaded before executing commands. +""" diff --git a/examples/skills_with_cube/agent/tools.py b/examples/skills_with_cube/agent/tools.py new file mode 100644 index 0000000..35150e3 --- /dev/null +++ b/examples/skills_with_cube/agent/tools.py @@ -0,0 +1,116 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Tools for the Cube-backed skill agent.""" +import os +from pathlib import Path +from typing import Any + +from trpc_agent_sdk.code_executors import WorkspaceInputSpec +from trpc_agent_sdk.code_executors.cube import CubeClientConfig +from trpc_agent_sdk.code_executors.cube import CubeWorkspaceRuntime +from trpc_agent_sdk.code_executors.cube import CubeWorkspaceRuntimeConfig +from trpc_agent_sdk.code_executors.cube import create_cube_sandbox_client +from trpc_agent_sdk.code_executors.cube import create_cube_workspace_runtime +from trpc_agent_sdk.skills import ENV_SKILLS_ROOT +from trpc_agent_sdk.skills import SkillToolSet +from trpc_agent_sdk.skills import create_default_skill_repository + + +def _get_skill_paths() -> str: + """Get the skill paths.""" + skills_root = os.getenv(ENV_SKILLS_ROOT) + if skills_root: + return skills_root + current_path = Path(__file__).parent + path = str(current_path.parent / "skills") + # convert to file URL + # path = "file://" + path + # 
"http://{host}:{port}/{path}/{filename}.{extension}" + # path = "http://localhost:8000/skills/skills.tar.gz" + return path + + +def _cube_client_config() -> CubeClientConfig: + """Build Cube executor config from environment variables.""" + return CubeClientConfig( + execute_timeout=float(os.getenv("CUBE_EXECUTE_TIMEOUT", "30")), + idle_timeout=int(os.getenv("CUBE_IDLE_TIMEOUT", "600")), + auto_recover=True, + ) + + +async def create_skill_tool_set() -> tuple[SkillToolSet, Any, CubeWorkspaceRuntime]: + """Create a Cube-backed skill tool set and its Cube runtime.""" + tool_kwargs = { + "save_as_artifacts": True, + "omit_inline_content": False, + } + + cfg = _cube_client_config() + sandbox_client = await create_cube_sandbox_client(cfg) + workspace_runtime = create_cube_workspace_runtime( + sandbox_client=sandbox_client, + execute_timeout=cfg.execute_timeout, + workspace_cfg=CubeWorkspaceRuntimeConfig(), + ) + print(f"[skills_with_cube] using Cube sandbox: {workspace_runtime.sandbox_id}", flush=True) + skill_paths = _get_skill_paths() + repository = create_default_skill_repository(skill_paths, workspace_runtime=workspace_runtime) + toolset = SkillToolSet(repository=repository, run_tool_kwargs=tool_kwargs) + return toolset, repository, workspace_runtime + + +def build_cube_stage_inputs_specs(inputs_host: str = "/tmp/skillrun-inputs") -> list[WorkspaceInputSpec]: + """Build example input specs for Cube runtime. + + The returned specs demonstrate the supported input schemes used by + ``CubeWorkspaceFS.stage_inputs``: + + - ``host://`` : upload from a host path into the remote Cube sandbox + - ``workspace://``: reuse a file already present in current workspace + - ``skill://`` : reference a file under workspace ``skills/`` + """ + return [ + WorkspaceInputSpec( + src=f"host://{inputs_host}/sales.csv", + dst="work/inputs/sales.csv", + mode="link", + ), + WorkspaceInputSpec( + # This file exists after skill staging, so the workspace:// demo is stable. 
+ src="workspace://skills/python-math/SKILL.md", + dst="work/staged_inputs/python-math_skill.md", + mode="copy", + ), + WorkspaceInputSpec( + src="skill://python-math/scripts/fib.py", + dst="work/staged_inputs/fib.py", + mode="copy", + ), + ] + + +def build_cube_skill_run_payload(skill_name: str = "python-math", + inputs_host: str = "/tmp/skillrun-inputs") -> dict[str, Any]: + """Build a full ``skill_run`` payload for Cube mode demonstration. + + This payload can be used directly when invoking the ``skill_run`` tool: + it stages input schemes into the remote Cube workspace and writes outputs + under ``out/``. + """ + return { + "skill": + skill_name, + "cwd": + f"$SKILLS_DIR/{skill_name}", + "command": ("python scripts/fib.py 10 > out/fib.txt && " + "(ls -R work/inputs; echo '---'; ls -R work/staged_inputs) > out/staged_inputs_tree.txt"), + "inputs": [spec.model_dump() for spec in build_cube_stage_inputs_specs(inputs_host=inputs_host)], + "output_files": [ + "out/fib.txt", + "out/staged_inputs_tree.txt", + ], + } diff --git a/examples/skills_with_cube/run_agent.py b/examples/skills_with_cube/run_agent.py new file mode 100644 index 0000000..017e651 --- /dev/null +++ b/examples/skills_with_cube/run_agent.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +""" +Example demonstrating the skills run flow in TRPC Agent framework. +""" +import asyncio +import json +import os +import uuid + +from dotenv import load_dotenv +from trpc_agent_sdk.runners import Runner +from trpc_agent_sdk.sessions import InMemorySessionService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + +load_dotenv() + + +async def _kill_remote_sandbox_for_auto_recover_test(workspace_runtime) -> None: + """Delete the remote Cube sandbox without clearing the local client. 
+ + This is intentionally test-only code: keeping the stale local handle lets + the next workspace operation hit SandboxNotFoundException and exercise the + CubeSandboxClient auto-recovery path. + """ + client = workspace_runtime._client # pylint: disable=protected-access + sandbox = client._require() # pylint: disable=protected-access + old_sandbox_id = sandbox.sandbox_id + await sandbox.kill() + print( + f"[skills_with_cube] killed remote Cube sandbox for auto-recover test: {old_sandbox_id}", + flush=True, + ) + + +async def run_skill_run_demo(): + """Run the skill run agent demo to demonstrate the various capabilities of an LLM agent.""" + + app_name = "skill_run_agent_demo" + + from agent.agent import create_agent + from agent.tools import build_cube_skill_run_payload + + root_agent, runtime_handle = await create_agent() + session_service = InMemorySessionService() + runner = Runner(app_name=app_name, agent=root_agent, session_service=session_service) + + user_id = "demo_user" + + cube_payload = build_cube_skill_run_payload( + skill_name="python-math", + inputs_host="/tmp/skillrun-inputs", + ) + cube_stage_inputs_request = f""" + Cube stage_inputs demonstration. + Please call skill_run once using this payload shape exactly: + {json.dumps(cube_payload, ensure_ascii=False)} + + Notes: + 1) The current runtime is Cube, so host:// inputs are uploaded into the remote sandbox. + 2) If artifact service is unavailable, continue with host://, workspace://, skill://. + 3) After running, explain which input schemes were staged successfully and include output file summaries. 
+ """ + + demo_queries = [cube_stage_inputs_request] + recreate_between_runs = os.getenv("SKILLS_WITH_CUBE_RECREATE_BETWEEN_RUNS", "").lower() in {"1", "true", "yes"} + kill_between_runs = os.getenv("SKILLS_WITH_CUBE_KILL_BETWEEN_RUNS", "").lower() in {"1", "true", "yes"} + if recreate_between_runs or kill_between_runs: + demo_queries.append(cube_stage_inputs_request) + + try: + for idx, query in enumerate(demo_queries): + if idx == 1: + if kill_between_runs: + print("[skills_with_cube] killing Cube sandbox before the next request...", flush=True) + await _kill_remote_sandbox_for_auto_recover_test(runtime_handle) + else: + print("[skills_with_cube] recreating Cube sandbox before the next request...", flush=True) + await runtime_handle.recreate() + print(f"[skills_with_cube] using Cube sandbox: {runtime_handle.sandbox_id}", flush=True) + + current_session_id = str(uuid.uuid4()) + + print(f"šŸ†” Session ID: {current_session_id[:8]}...") + print(f"šŸ“ User: {query}") + + user_content = Content(parts=[Part.from_text(text=query)]) + + print("šŸ¤– Assistant: ", end="", flush=True) + async for event in runner.run_async(user_id=user_id, session_id=current_session_id, new_message=user_content): + if not event.content or not event.content.parts: + continue + + for part in event.content.parts: + if event.partial: + if part.text: + print(part.text, end="", flush=True) + continue + + if part.thought: + continue + if part.function_call: + print(f"\nšŸ”§ [Invoke Tool:: {part.function_call.name}({part.function_call.args})]") + elif part.function_response: + print(f"šŸ“Š [Tool Result: {part.function_response.response}]") + # elif part.text: + # print(f"\nāœ… {part.text}") + + print("\n" + "-" * 40) + finally: + await runner.close() + await runtime_handle.destroy() + + +if __name__ == "__main__": + os.system("echo 'hello from skillrun' > /tmp/skillrun-notes.txt") + os.system("echo 'this is another line' >> /tmp/skillrun-notes.txt") + os.system("mkdir -p /tmp/skillrun-inputs") + 
os.system("""cat > /tmp/skillrun-inputs/sales.csv << 'EOF' +region,amount +north,100 +south,200 +EOF +""") + # Create sample CSV file for data analysis skill + os.system("""cat > /tmp/sales_data.csv << 'EOF' +Date,Product,Sales,Quantity,Region +2024-01-01,Product A,1000,10,North +2024-01-02,Product B,1500,15,South +2024-01-03,Product A,1200,12,North +2024-01-04,Product C,800,8,East +2024-01-05,Product B,2000,20,South +2024-01-06,Product A,900,9,West +2024-01-07,Product C,1100,11,East +2024-01-08,Product B,1800,18,North +EOF +""") + asyncio.run(run_skill_run_demo()) + os.system("rm -rf /tmp/skillrun-inputs/*") + os.system("rm -rf /tmp/sales_data.csv") diff --git a/examples/skills_with_cube/skills/python-math/SKILL.md b/examples/skills_with_cube/skills/python-math/SKILL.md new file mode 100644 index 0000000..e6a53de --- /dev/null +++ b/examples/skills_with_cube/skills/python-math/SKILL.md @@ -0,0 +1,31 @@ +--- +name: python-math +description: Small Python utilities for math and text files. +--- + +Overview + +Run short Python scripts inside the skill workspace. Results can be +returned as text and saved as output files. + +Examples + +1) Print the first N Fibonacci numbers + + Command: + + python3 scripts/fib.py 10 > out/fib.txt + +2) Sum a list of integers + + Command: + + python3 - <<'PY' +from sys import stdin +nums = [int(x) for x in stdin.read().split()] +print(sum(nums)) +PY + +Output Files + +- out/fib.txt diff --git a/examples/skills_with_cube/skills/python-math/scripts/fib.py b/examples/skills_with_cube/skills/python-math/scripts/fib.py new file mode 100644 index 0000000..7f2933f --- /dev/null +++ b/examples/skills_with_cube/skills/python-math/scripts/fib.py @@ -0,0 +1,24 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+ +import sys + + +def fib(n: int): + a, b = 0, 1 + for _ in range(n): + print(a) + a, b = b, a + b + + +if __name__ == "__main__": + n = 10 + if len(sys.argv) > 1: + try: + n = int(sys.argv[1]) + except Exception: + n = 10 + fib(n) diff --git a/tests/code_executors/cube/test_bug_hunt.py b/tests/code_executors/cube/test_bug_hunt.py index a2c54f9..e3229c9 100644 --- a/tests/code_executors/cube/test_bug_hunt.py +++ b/tests/code_executors/cube/test_bug_hunt.py @@ -569,7 +569,8 @@ async def test_bug12_commands_run_translates_timeout_to_structured_result( side_effect=fake_e2b.TimeoutException() ) client = CubeSandboxClient( - fake_async_sandbox, idle_timeout=60, execute_timeout=30.0, + fake_async_sandbox, + CubeCodeExecutorConfig(template="t", api_url="u", api_key="k", idle_timeout=60, execute_timeout=30.0), ) result = await client.commands_run("sleep 9999", timeout=1.5) assert isinstance(result, CubeCommandResult) @@ -608,13 +609,11 @@ async def test_bug12_execute_code_surfaces_deadline_exceeded_outcome( fake_async_sandbox.commands.run = AsyncMock( side_effect=fake_e2b.TimeoutException() ) - client = CubeSandboxClient( - fake_async_sandbox, idle_timeout=60, execute_timeout=2.0, - ) cfg = CubeCodeExecutorConfig( template="t", api_url="u", api_key="k", idle_timeout=60, execute_timeout=2.0, ) + client = CubeSandboxClient(fake_async_sandbox, cfg) executor = CubeCodeExecutor(client, cfg) # execute_code MUST return a result, not raise. 
diff --git a/tests/code_executors/cube/test_code_executor.py b/tests/code_executors/cube/test_code_executor.py index 2b014ed..24d4e62 100644 --- a/tests/code_executors/cube/test_code_executor.py +++ b/tests/code_executors/cube/test_code_executor.py @@ -125,7 +125,7 @@ async def test_no_sandbox_id_opens_new(self, fake_e2b, monkeypatch, mock_client) monkeypatch.setattr( CubeSandboxClient, "open_existing", - classmethod(lambda cls, sid, cfg: open_existing(sid, cfg)), + classmethod(lambda cls, cfg: open_existing(cfg)), ) cfg = _cfg(sandbox_id=None) ex = await CubeCodeExecutor.create(cfg) @@ -143,12 +143,12 @@ async def test_with_sandbox_id_opens_existing(self, fake_e2b, monkeypatch, mock_ monkeypatch.setattr( CubeSandboxClient, "open_existing", - classmethod(lambda cls, sid, cfg: open_existing(sid, cfg)), + classmethod(lambda cls, cfg: open_existing(cfg)), ) cfg = _cfg(sandbox_id="sbx-42") await CubeCodeExecutor.create(cfg) open_new.assert_not_awaited() - open_existing.assert_awaited_once_with("sbx-42", cfg) + open_existing.assert_awaited_once_with(cfg) class TestAttach: @@ -164,11 +164,11 @@ async def test_with_sandbox_id_calls_open_existing(self, fake_e2b, monkeypatch, monkeypatch.setattr( CubeSandboxClient, "open_existing", - classmethod(lambda cls, sid, cfg: called(sid, cfg)), + classmethod(lambda cls, cfg: called(cfg)), ) cfg = _cfg(sandbox_id="sbx-1") ex = await CubeCodeExecutor.attach(cfg) - called.assert_awaited_once_with("sbx-1", cfg) + called.assert_awaited_once_with(cfg) assert ex.sandbox_client is mock_client @pytest.mark.asyncio @@ -183,7 +183,7 @@ async def test_never_calls_open_new(self, fake_e2b, monkeypatch): monkeypatch.setattr( CubeSandboxClient, "open_existing", - classmethod(lambda cls, sid, cfg: on_existing(sid, cfg)), + classmethod(lambda cls, cfg: on_existing(cfg)), ) with pytest.raises(RuntimeError, match="test stopper"): await CubeCodeExecutor.attach(_cfg(sandbox_id="sbx-1")) diff --git a/tests/code_executors/cube/test_runtime.py 
b/tests/code_executors/cube/test_runtime.py index d0719d4..c2144f5 100644 --- a/tests/code_executors/cube/test_runtime.py +++ b/tests/code_executors/cube/test_runtime.py @@ -8,6 +8,7 @@ from __future__ import annotations from pathlib import Path +from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock import pytest @@ -772,3 +773,45 @@ def test_provider_and_flag_forwarded(self, mock_client): rt = create_cube_workspace_runtime(ex, provider=provider, enable_provider_env=True) assert rt._runner._run_env_provider is provider assert rt._runner._enable_provider_env is True + + +class TestCubeWorkspaceRuntimeAutoRecover: + + @pytest.mark.asyncio + async def test_recreates_and_retries_when_sandbox_is_missing(self, fake_e2b, monkeypatch): + cfg = CubeCodeExecutorConfig(template="tpl", api_url="url", api_key="key", auto_recover=True) + sandbox1 = MagicMock() + sandbox1.sandbox_id = "old" + sandbox1.kill = AsyncMock(return_value=None) + sandbox1.set_timeout = AsyncMock(return_value=None) + sandbox1.commands = MagicMock() + sandbox1.commands.run = AsyncMock(side_effect=fake_e2b.SandboxNotFoundException("gone")) + client1 = CubeSandboxClient(sandbox1, cfg) + + sandbox2 = MagicMock() + sandbox2.sandbox_id = "new" + sandbox2.set_timeout = AsyncMock(return_value=None) + sandbox2.commands = MagicMock() + sandbox2.commands.run = AsyncMock(return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)) + client2 = CubeSandboxClient(sandbox2, cfg) + + executor1 = MagicMock() + executor1.config = cfg + executor1.sandbox_id = "old" + executor1.sandbox_client = client1 + + open_new = AsyncMock(return_value=client2) + monkeypatch.setattr(rt_mod.CubeSandboxClient, "open_new", open_new) + monkeypatch.setattr(rt_mod.time, "time_ns", lambda: 123) + + runtime = create_cube_workspace_runtime( + executor1, + workspace_cfg=CubeWorkspaceRuntimeConfig(), + ) + info = await runtime.manager().create_workspace("id") + + assert info.path == "/workspace/cube_agent/ws_id_123" + 
assert runtime.sandbox_id == "new" + open_new.assert_awaited_once_with(cfg) + sandbox1.kill.assert_awaited_once() + sandbox2.commands.run.assert_awaited_once() diff --git a/tests/code_executors/cube/test_sandbox.py b/tests/code_executors/cube/test_sandbox.py index 7ae806e..41a82df 100644 --- a/tests/code_executors/cube/test_sandbox.py +++ b/tests/code_executors/cube/test_sandbox.py @@ -15,6 +15,7 @@ from __future__ import annotations import asyncio +from dataclasses import replace from pathlib import Path from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock @@ -24,7 +25,7 @@ from trpc_agent_sdk.code_executors.cube import _sandbox from trpc_agent_sdk.code_executors.cube._sandbox import ( CubeCommandResult, - CubeSandboxClient, + CubeSandboxClient as _CubeSandboxClient, ) from trpc_agent_sdk.code_executors.cube._types import CubeCodeExecutorConfig @@ -41,6 +42,24 @@ def _cfg(**overrides) -> CubeCodeExecutorConfig: return CubeCodeExecutorConfig(**base) +class CubeSandboxClient(_CubeSandboxClient): + """Test adapter for constructing clients from timeout kwargs.""" + + def __init__(self, sandbox, cfg=None, *, idle_timeout=None, execute_timeout=None): + if cfg is None: + cfg = _cfg( + idle_timeout=idle_timeout if idle_timeout is not None else 60, + execute_timeout=execute_timeout if execute_timeout is not None else 30.0, + ) + super().__init__(sandbox, cfg) + + @classmethod + async def open_existing(cls, sandbox_id_or_cfg, cfg=None): + if cfg is None: + return await super().open_existing(sandbox_id_or_cfg) + return await super().open_existing(replace(cfg, sandbox_id=sandbox_id_or_cfg)) + + # --------------------------------------------------------------------------- # Construction & sandbox_id # --------------------------------------------------------------------------- diff --git a/tests/skills/stager/test_base_stager.py b/tests/skills/stager/test_base_stager.py index aecc774..bb2f0dc 100644 --- a/tests/skills/stager/test_base_stager.py +++ 
b/tests/skills/stager/test_base_stager.py @@ -64,6 +64,7 @@ def _make_repository(path="/skills/test-skill"): repo = MagicMock() repo.path = MagicMock(return_value=path) repo.workspace_runtime = _make_runtime() + repo.get_workspace_runtime = MagicMock(return_value=repo.workspace_runtime) return repo @@ -270,6 +271,23 @@ async def test_fresh_staging(self, mock_digest): result = await stager.stage_skill(request) assert result.workspace_skill_dir == "skills/test-skill" + @patch("trpc_agent_sdk.skills.stager._base_stager.compute_dir_digest", return_value="new_digest") + async def test_stage_skill_uses_repository_runtime(self, mock_digest): + stager = Stager() + repo = _make_repository() + request = _make_request(repo=repo) + runtime = repo.get_workspace_runtime.return_value + + mock_file = MagicMock() + mock_file.content = json.dumps({"version": 1, "skills": {}}) + runtime.fs(request.ctx).collect = AsyncMock(return_value=[mock_file]) + + result = await stager.stage_skill(request) + + assert result.workspace_skill_dir == "skills/test-skill" + repo.get_workspace_runtime.assert_called_once_with(request.ctx) + runtime.fs(request.ctx).stage_directory.assert_awaited_once() + @patch("trpc_agent_sdk.skills.stager._base_stager.compute_dir_digest", return_value="same_digest") async def test_cached_staging_with_links(self, mock_digest): stager = Stager() diff --git a/tests/skills/tools/test_copy_stager.py b/tests/skills/tools/test_copy_stager.py index fe02c80..5787680 100644 --- a/tests/skills/tools/test_copy_stager.py +++ b/tests/skills/tools/test_copy_stager.py @@ -128,6 +128,7 @@ async def test_stage_delegates_to_parent(self, mock_digest): runtime.fs = MagicMock(return_value=fs) runtime.runner = MagicMock(return_value=runner) repo.workspace_runtime = runtime + repo.get_workspace_runtime = MagicMock(return_value=runtime) from trpc_agent_sdk.skills.stager._types import SkillStageRequest request = SkillStageRequest( diff --git a/tests/skills/tools/test_skill_load.py 
b/tests/skills/tools/test_skill_load.py index 7c4987b..4d7e9d1 100644 --- a/tests/skills/tools/test_skill_load.py +++ b/tests/skills/tools/test_skill_load.py @@ -31,6 +31,7 @@ from trpc_agent_sdk.skills.tools._skill_load import ( SkillLoadTool, ) +from trpc_agent_sdk.skills.stager import SkillStageResult def _make_ctx(repository=None): @@ -115,6 +116,36 @@ def test_include_all_docs_sets_star(self): assert ctx.actions.state_delta[docs_state_key(ctx, "test-skill")] == "*" +class TestWorkspaceRuntimeResolver: + @pytest.mark.asyncio + async def test_ensure_staged_uses_repository_runtime(self): + repo_runtime = MagicMock() + repo = MagicMock() + repo.workspace_runtime = repo_runtime + + resolved_runtime = MagicMock() + manager = MagicMock() + manager.create_workspace = AsyncMock(return_value=MagicMock()) + resolved_runtime.manager = MagicMock(return_value=manager) + repo.get_workspace_runtime = MagicMock(return_value=resolved_runtime) + + stager = MagicMock() + stager.stage_skill = AsyncMock(return_value=SkillStageResult(workspace_skill_dir="skills/test-skill")) + + ctx = _make_ctx(repo) + tool = SkillLoadTool( + repository=repo, + skill_stager=stager, + create_ws_name_cb=lambda _: "ws", + ) + + await tool._ensure_staged(ctx=ctx, skill_name="test-skill") + + resolved_runtime.manager.assert_called_once_with(ctx) + repo.get_workspace_runtime.assert_called_once_with(ctx) + repo_runtime.manager.assert_not_called() + + # --------------------------------------------------------------------------- # _set_state_delta_for_skill_tools # --------------------------------------------------------------------------- diff --git a/tests/skills/tools/test_skill_run.py b/tests/skills/tools/test_skill_run.py index b1e43ae..a6c8e43 100644 --- a/tests/skills/tools/test_skill_run.py +++ b/tests/skills/tools/test_skill_run.py @@ -113,6 +113,19 @@ def test_get_repository(self): ctx = MagicMock() assert tool._get_repository(ctx) is repo + def test_repository_get_workspace_runtime_is_used(self): 
+ repo_runtime = MagicMock() + repo = MagicMock() + repo.workspace_runtime = repo_runtime + resolved_runtime = MagicMock() + repo.get_workspace_runtime = MagicMock(return_value=resolved_runtime) + ctx = MagicMock() + + tool = SkillRunTool(repository=repo) + + assert tool._get_repository(ctx).get_workspace_runtime(ctx) is resolved_runtime + repo.get_workspace_runtime.assert_called_once_with(ctx) + def test_is_skill_loaded(self): tool = _make_tool() ctx = MagicMock() diff --git a/trpc_agent_sdk/code_executors/__init__.py b/trpc_agent_sdk/code_executors/__init__.py index 75c2f1f..eacd4e7 100644 --- a/trpc_agent_sdk/code_executors/__init__.py +++ b/trpc_agent_sdk/code_executors/__init__.py @@ -20,6 +20,7 @@ from ._base_workspace_runtime import DefaultWorkspace from ._base_workspace_runtime import new_default_workspace_runtime from ._base_workspace_runtime import WorkspaceRuntimeResolver +from ._base_workspace_runtime import get_workspace_runtime_with_resolver from ._code_executor_context import CodeExecutorContext from ._constants import DEFAULT_CREATE_TIMEOUT_SEC from ._constants import DEFAULT_FILE_MODE @@ -104,6 +105,7 @@ "DefaultWorkspace", "new_default_workspace_runtime", "WorkspaceRuntimeResolver", + "get_workspace_runtime_with_resolver", "CodeExecutorContext", "DEFAULT_CREATE_TIMEOUT_SEC", "DEFAULT_FILE_MODE", diff --git a/trpc_agent_sdk/code_executors/_base_workspace_runtime.py b/trpc_agent_sdk/code_executors/_base_workspace_runtime.py index 1ce6af3..fd6c897 100644 --- a/trpc_agent_sdk/code_executors/_base_workspace_runtime.py +++ b/trpc_agent_sdk/code_executors/_base_workspace_runtime.py @@ -536,3 +536,23 @@ def new_default_workspace_runtime( WorkspaceRuntimeResolver: TypeAlias = Callable[[InvocationContext], BaseWorkspaceRuntime] """Callback to resolve a workspace runtime.""" + + +def get_workspace_runtime_with_resolver( + ctx: InvocationContext, + resolver: Optional[WorkspaceRuntimeResolver] = None, + workspace_runtime: Optional[BaseWorkspaceRuntime] = None) 
-> BaseWorkspaceRuntime: + """ + Get workspace runtime. + Args: + ctx: InvocationContext + resolver: WorkspaceRuntimeResolver + workspace_runtime: Optional[BaseWorkspaceRuntime] + Returns: + BaseWorkspaceRuntime + """ + if resolver is not None: + workspace_runtime = resolver(ctx) + if workspace_runtime is None: + raise ValueError("Workspace runtime not found") + return workspace_runtime diff --git a/trpc_agent_sdk/code_executors/cube/__init__.py b/trpc_agent_sdk/code_executors/cube/__init__.py index 7cd114c..47fe50e 100644 --- a/trpc_agent_sdk/code_executors/cube/__init__.py +++ b/trpc_agent_sdk/code_executors/cube/__init__.py @@ -21,13 +21,17 @@ from ._runtime import create_cube_workspace_runtime from ._sandbox import CubeCommandResult from ._sandbox import CubeSandboxClient +from ._sandbox import create_cube_sandbox_client from ._transfer import OnExisting +from ._types import CubeClientConfig from ._types import CubeCodeExecutorConfig from ._types import CubeWorkspaceRuntimeConfig __all__ = [ "CubeCodeExecutor", + "CubeClientConfig", "CubeCodeExecutorConfig", + "create_cube_sandbox_client", "CubeCommandResult", "CubeProgramRunner", "CubeSandboxClient", diff --git a/trpc_agent_sdk/code_executors/cube/_code_executor.py b/trpc_agent_sdk/code_executors/cube/_code_executor.py index 967da10..5c4b3a4 100644 --- a/trpc_agent_sdk/code_executors/cube/_code_executor.py +++ b/trpc_agent_sdk/code_executors/cube/_code_executor.py @@ -29,6 +29,7 @@ from .._types import create_code_execution_result from ._sandbox import CubeCommandResult from ._sandbox import CubeSandboxClient +from ._sandbox import create_cube_sandbox_client from ._types import CubeCodeExecutorConfig _PYTHON_LANGUAGES = frozenset({"python", "py", "python3", ""}) @@ -110,10 +111,7 @@ def __init__( @classmethod async def create(cls, cfg: CubeCodeExecutorConfig) -> "CubeCodeExecutor": """Strict factory. 
Attaches when ``cfg.sandbox_id`` is set, else creates.""" - if cfg.sandbox_id: - client = await CubeSandboxClient.open_existing(cfg.sandbox_id, cfg) - else: - client = await CubeSandboxClient.open_new(cfg) + client = await create_cube_sandbox_client(cfg) return cls(client, cfg) @classmethod @@ -126,7 +124,7 @@ async def attach(cls, cfg: CubeCodeExecutorConfig) -> "CubeCodeExecutor": if not cfg.sandbox_id: raise ValueError("CubeCodeExecutor.attach requires cfg.sandbox_id to be set; " "use CubeCodeExecutor.create(cfg) to create a fresh sandbox.") - client = await CubeSandboxClient.open_existing(cfg.sandbox_id, cfg) + client = await CubeSandboxClient.open_existing(cfg) return cls(client, cfg) @classmethod @@ -144,14 +142,13 @@ async def create_or_recreate( created. PAUSED state and other errors propagate unchanged so that operator-managed pauses are not silently overwritten. """ - if not cfg.sandbox_id: - return await cls.create(cfg) try: return await cls.create(cfg) except e2b.SandboxNotFoundException: if on_stale is not None: await on_stale() - return await cls.create(replace(cfg, sandbox_id=None)) + cfg = replace(cfg, sandbox_id=None) + return await cls.create(cfg) @property def sandbox_id(self) -> str: diff --git a/trpc_agent_sdk/code_executors/cube/_runtime.py b/trpc_agent_sdk/code_executors/cube/_runtime.py index 8056d3f..c9c0155 100644 --- a/trpc_agent_sdk/code_executors/cube/_runtime.py +++ b/trpc_agent_sdk/code_executors/cube/_runtime.py @@ -54,6 +54,7 @@ from ._paths import shell_quote from ._sandbox import CubeSandboxClient from ._types import CubeWorkspaceRuntimeConfig +from ._types import DEFAULT_EXECUTE_TIMEOUT _RE_SAFE_ID = re.compile(r"[^a-zA-Z0-9_-]") @@ -414,15 +415,28 @@ def __init__( enable_provider_env: bool = False, ): self._client = client - self._fs = CubeWorkspaceFS(client, execute_timeout) - self._manager = CubeWorkspaceManager(client, remote_workspace, execute_timeout) + self._fs = CubeWorkspaceFS(self._client, execute_timeout) + 
self._manager = CubeWorkspaceManager(self._client, remote_workspace, execute_timeout) self._runner = CubeProgramRunner( - client, + self._client, execute_timeout, provider=provider, enable_provider_env=enable_provider_env, ) + @property + def sandbox_id(self) -> str | None: + """Current Cube sandbox id.""" + return self._client.sandbox_id + + async def recreate(self) -> None: + """Force sandbox recreation when the client supports it.""" + await self._client.recreate() + + async def destroy(self) -> None: + """Destroy the current Cube sandbox/client.""" + await self._client.destroy() + @override def manager(self, ctx: Optional[InvocationContext] = None) -> CubeWorkspaceManager: return self._manager @@ -446,8 +460,9 @@ def describe(self, ctx: Optional[InvocationContext] = None) -> WorkspaceCapabili def create_cube_workspace_runtime( - executor: CubeCodeExecutor, - *, + executor: CubeCodeExecutor | None = None, + sandbox_client: CubeSandboxClient | None = None, + execute_timeout: float = DEFAULT_EXECUTE_TIMEOUT, workspace_cfg: Optional[CubeWorkspaceRuntimeConfig] = None, provider: Optional[RunEnvProvider] = None, enable_provider_env: bool = False, @@ -466,13 +481,26 @@ def create_cube_workspace_runtime( For lower-level integrations, construct :class:`CubeWorkspaceRuntime` directly with an explicit client + ``remote_workspace`` + ``execute_timeout``. 
+ Args: + executor: CubeCodeExecutor instance; deprecated and will be removed in the future + sandbox_client: CubeSandboxClient instance, required when executor is not given + execute_timeout: execute timeout, defaults to DEFAULT_EXECUTE_TIMEOUT + workspace_cfg: workspace configuration, defaults to CubeWorkspaceRuntimeConfig() + provider: provider, defaults to None + enable_provider_env: enable provider environment, defaults to False + Returns: + CubeWorkspaceRuntime instance """ + if executor: + sandbox_client = executor.sandbox_client + execute_timeout = executor.config.execute_timeout + if not sandbox_client: + raise ValueError("sandbox_client is required") ws_cfg = workspace_cfg or CubeWorkspaceRuntimeConfig() - exec_cfg = executor.config return CubeWorkspaceRuntime( - executor.sandbox_client, + sandbox_client, remote_workspace=ws_cfg.remote_workspace, - execute_timeout=exec_cfg.execute_timeout, + execute_timeout=execute_timeout, provider=provider, enable_provider_env=enable_provider_env, ) diff --git a/trpc_agent_sdk/code_executors/cube/_sandbox.py b/trpc_agent_sdk/code_executors/cube/_sandbox.py index 327e15c..e6d432e 100644 --- a/trpc_agent_sdk/code_executors/cube/_sandbox.py +++ b/trpc_agent_sdk/code_executors/cube/_sandbox.py @@ -37,10 +37,14 @@ import asyncio from dataclasses import dataclass +from dataclasses import replace from pathlib import Path from typing import Any +from typing import Awaitable +from typing import Callable from typing import Mapping from typing import Optional +from typing import TypeVar import e2b_code_interpreter as e2b from e2b_code_interpreter import AsyncSandbox @@ -52,12 +56,21 @@ from ._transfer import download_directory_via_tar from ._transfer import reserve_local_destination from ._transfer import upload_directory_via_tar -from ._types import CubeCodeExecutorConfig +from ._types import CubeClientConfig # The unix user we run sandbox commands and FS ops as.
Standard cube/e2b # templates ship with `root`; downstream callers do not need to override # this and we deliberately do not expose a knob to keep the surface small. _GUEST_USER = "root" +_T = TypeVar("_T") + + +def _is_stale_sandbox_error(exc: BaseException) -> bool: + """Return whether ``exc`` means the remote sandbox disappeared.""" + if isinstance(exc, e2b.SandboxNotFoundException): + return True + message = str(exc).lower() + return "code.unknown" in message and "requested resource does not exist" in message @dataclass @@ -112,18 +125,20 @@ class CubeSandboxClient: the constructor directly. """ - def __init__(self, sandbox: AsyncSandbox, *, idle_timeout: int, execute_timeout: float): + def __init__(self, sandbox: AsyncSandbox, cfg: CubeClientConfig): self._sbx: Optional[AsyncSandbox] = sandbox - self._idle_timeout = idle_timeout - self._execute_timeout = execute_timeout + self._cfg = cfg + self._recreate_cfg = replace(cfg, sandbox_id=None) + self._idle_timeout = cfg.idle_timeout + self._execute_timeout = cfg.execute_timeout + self._recreate_lock = asyncio.Lock() @property def sandbox_id(self) -> str: - sbx = self._require() - return sbx.sandbox_id + return self._require().sandbox_id @classmethod - async def open_new(cls, cfg: CubeCodeExecutorConfig) -> "CubeSandboxClient": + async def open_new(cls, cfg: CubeClientConfig) -> "CubeSandboxClient": """Create a brand-new remote sandbox.""" sbx = await e2b.AsyncSandbox.create( template=cfg.resolve_template(), @@ -131,10 +146,13 @@ async def open_new(cls, cfg: CubeCodeExecutorConfig) -> "CubeSandboxClient": api_key=cfg.resolve_api_key(), timeout=cfg.idle_timeout, ) - return cls(sbx, idle_timeout=cfg.idle_timeout, execute_timeout=cfg.execute_timeout) + return cls(sbx, cfg) @classmethod - async def open_existing(cls, sandbox_id: str, cfg: CubeCodeExecutorConfig) -> "CubeSandboxClient": + async def open_existing( + cls, + cfg: CubeClientConfig, + ) -> "CubeSandboxClient": """Attach to an existing remote sandbox and 
assert it is RUNNING. Raises: @@ -144,12 +162,14 @@ async def open_existing(cls, sandbox_id: str, cfg: CubeCodeExecutorConfig) -> "C PAUSED); caller should not silently overwrite locator state. """ + if not cfg.sandbox_id: + raise ValueError("CubeSandboxClient.open_existing requires cfg.sandbox_id") sbx = await e2b.AsyncSandbox.connect( - sandbox_id, + cfg.sandbox_id, api_url=cfg.resolve_api_url(), api_key=cfg.resolve_api_key(), ) - client = cls(sbx, idle_timeout=cfg.idle_timeout, execute_timeout=cfg.execute_timeout) + client = cls(sbx, cfg) await client.assert_running() return client @@ -180,6 +200,11 @@ async def destroy(self) -> None: finally: self._sbx = None + async def recreate(self) -> None: + """Explicitly replace the current sandbox with a fresh one.""" + async with self._recreate_lock: + await self._recreate_locked() + async def assert_running(self) -> None: """Verify the sandbox is RUNNING; reject PAUSED and surface stale ids. @@ -188,11 +213,7 @@ async def assert_running(self) -> None: - PAUSED state raises :class:`SandboxException` so callers do not silently discard operator-managed pause state. """ - sbx = self._require() - info = await sbx.get_info(request_timeout=self._execute_timeout) - if info.state != e2b.SandboxState.RUNNING: - raise e2b.SandboxException(f"Cube sandbox {sbx.sandbox_id} is in state {info.state.value!r}, " - f"expected {e2b.SandboxState.RUNNING.value!r}.") + await self._with_recovery(self._assert_running_once) async def set_timeout(self, seconds: int) -> None: """Best-effort idle-timeout renewal. @@ -202,11 +223,7 @@ async def set_timeout(self, seconds: int) -> None: truncated by ``int(...)`` (e.g. ``0.9`` → ``0``, which most vendor APIs interpret as "no timeout" / "expire immediately"). 
""" - sbx = self._require() - try: - await sbx.set_timeout(seconds) - except Exception as exc: # pylint: disable=broad-exception-caught - logger.debug("Cube sandbox %s set_timeout failed: %s", sbx.sandbox_id, exc) + await self._with_recovery(lambda: self._set_timeout_once(seconds)) async def commands_run( self, @@ -228,59 +245,13 @@ async def commands_run( provided) is encoded as a bash heredoc because the e2b SDK's ``stdin`` flag is not a data channel. """ - sbx = self._require() - if stdin is not None: - command = wrap_stdin_heredoc(command, stdin) - timeout_sec = float(timeout if timeout is not None else self._execute_timeout) - kwargs: dict[str, Any] = { - "envs": dict(env or {}), - "user": _GUEST_USER, - "timeout": timeout_sec, - } - if cwd: - kwargs["cwd"] = cwd - - loop = asyncio.get_running_loop() - start = loop.time() - timed_out = False - try: - result = await sbx.commands.run(command, **kwargs) - except e2b.CommandExitException as exc: - result = exc - except BaseException as exc: - # Timeouts surface here as one of several types depending on - # which transport layer fires first: - # - e2b.TimeoutException (vendor SDK layer) - # - httpcore.ReadTimeout / httpcore.TimeoutException - # (transport layer — can race ahead of the e2b mapping on - # slow Cube deployments) - # The httpcore path is only reachable via the transitive - # dependency, so we match by type-name instead of importing - # httpcore just to subclass-check. We still re-raise anything - # that is not timeout-flavoured so real errors stay visible. 
- name = type(exc).__name__ - if "Timeout" not in name: - raise - result = None - timed_out = True - duration = loop.time() - start - - await self.set_timeout(self._idle_timeout) - - if timed_out: - return CubeCommandResult( - stdout="", - stderr=f"Command timed out after {timeout_sec:g}s", - exit_code=-1, - duration=float(duration), - timed_out=True, - ) - return CubeCommandResult( - stdout=str(getattr(result, "stdout", "") or ""), - stderr=str(getattr(result, "stderr", "") or ""), - exit_code=int(getattr(result, "exit_code", 0) or 0), - duration=float(duration), - ) + return await self._with_recovery(lambda: self._commands_run_once( + command, + cwd=cwd, + env=env, + stdin=stdin, + timeout=timeout, + )) async def upload_path(self, local: Path, remote_abs: str) -> None: """Upload a host file or directory to an absolute remote path. @@ -330,22 +301,125 @@ async def download_path( async def read_file_bytes(self, remote_abs: str) -> bytes: """Read a remote file's raw bytes.""" - sbx = self._require() - data = await sbx.files.read(remote_abs, format="bytes", user=_GUEST_USER) + data = await self._with_recovery( + lambda: self._require().files.read(remote_abs, format="bytes", user=_GUEST_USER)) return data if isinstance(data, bytes) else bytes(data or b"") async def write_file_bytes(self, remote_abs: str, data: bytes) -> None: """Write raw bytes to a remote file.""" - sbx = self._require() - await sbx.files.write(remote_abs, data, user=_GUEST_USER) + await self._with_recovery(lambda: self._require().files.write(remote_abs, data, user=_GUEST_USER)) async def _is_remote_dir(self, remote_abs: str) -> bool: """Return whether ``remote_abs`` resolves to a directory inside the sandbox.""" - sbx = self._require() - info = await sbx.files.get_info(remote_abs, user=_GUEST_USER) + info = await self._with_recovery(lambda: self._require().files.get_info(remote_abs, user=_GUEST_USER)) return info.type == e2b.FileType.DIR + async def _assert_running_once(self) -> None: + sbx = 
self._require() + info = await sbx.get_info(request_timeout=self._execute_timeout) + if info.state != e2b.SandboxState.RUNNING: + raise e2b.SandboxException(f"Cube sandbox {sbx.sandbox_id} is in state {info.state.value!r}, " + f"expected {e2b.SandboxState.RUNNING.value!r}.") + + async def _set_timeout_once(self, seconds: int) -> None: + sbx = self._require() + try: + await sbx.set_timeout(seconds) + except Exception as exc: # pylint: disable=broad-exception-caught + logger.debug("Cube sandbox %s set_timeout failed: %s", sbx.sandbox_id, exc) + + async def _commands_run_once( + self, + command: str, + *, + cwd: Optional[str] = None, + env: Optional[Mapping[str, str]] = None, + stdin: Optional[bytes] = None, + timeout: Optional[float] = None, + ) -> CubeCommandResult: + sbx = self._require() + if stdin is not None: + command = wrap_stdin_heredoc(command, stdin) + timeout_sec = float(timeout if timeout is not None else self._execute_timeout) + kwargs: dict[str, Any] = { + "envs": dict(env or {}), + "user": _GUEST_USER, + "timeout": timeout_sec, + } + if cwd: + kwargs["cwd"] = cwd + + loop = asyncio.get_running_loop() + start = loop.time() + timed_out = False + try: + result = await sbx.commands.run(command, **kwargs) + except e2b.CommandExitException as exc: + result = exc + except BaseException as exc: + # Timeouts surface here as one of several types depending on + # which transport layer fires first: + # - e2b.TimeoutException (vendor SDK layer) + # - httpcore.ReadTimeout / httpcore.TimeoutException + # (transport layer — can race ahead of the e2b mapping on + # slow Cube deployments) + # The httpcore path is only reachable via the transitive + # dependency, so we match by type-name instead of importing + # httpcore just to subclass-check. We still re-raise anything + # that is not timeout-flavoured so real errors stay visible. 
+ name = type(exc).__name__ + if "Timeout" not in name: + raise + result = None + timed_out = True + duration = loop.time() - start + + await self.set_timeout(self._idle_timeout) + + if timed_out: + return CubeCommandResult( + stdout="", + stderr=f"Command timed out after {timeout_sec:g}s", + exit_code=-1, + duration=float(duration), + timed_out=True, + ) + return CubeCommandResult( + stdout=str(getattr(result, "stdout", "") or ""), + stderr=str(getattr(result, "stderr", "") or ""), + exit_code=int(getattr(result, "exit_code", 0) or 0), + duration=float(duration), + ) + + async def _with_recovery(self, op: Callable[[], Awaitable[_T]]) -> _T: + sandbox = self._sbx + try: + return await op() + except Exception as exc: + if not self._cfg.auto_recover or not _is_stale_sandbox_error(exc): + raise + logger.info("Cube sandbox expired; recreating sandbox client: %s", exc) + async with self._recreate_lock: + if self._sbx is sandbox: + await self._recreate_locked() + return await op() + + async def _recreate_locked(self) -> None: + if self._sbx is not None: + await self.destroy() + fresh = await type(self).open_new(self._recreate_cfg) + self._sbx = fresh._require() + fresh.close() + logger.info("Cube sandbox client using sandbox: %s", self.sandbox_id) + def _require(self) -> AsyncSandbox: if self._sbx is None: raise RuntimeError("CubeSandboxClient is closed.") return self._sbx + + +async def create_cube_sandbox_client(cfg: CubeClientConfig) -> CubeSandboxClient: + """Create or attach a Cube sandbox client from config.""" + if cfg.sandbox_id: + return await CubeSandboxClient.open_existing(cfg) + return await CubeSandboxClient.open_new(cfg) diff --git a/trpc_agent_sdk/code_executors/cube/_types.py b/trpc_agent_sdk/code_executors/cube/_types.py index 261649b..4d25c60 100644 --- a/trpc_agent_sdk/code_executors/cube/_types.py +++ b/trpc_agent_sdk/code_executors/cube/_types.py @@ -21,17 +21,15 @@ @dataclass -class CubeCodeExecutorConfig: - """Configuration for 
:class:`CubeCodeExecutor`. +class CubeClientConfig: + """Configuration for :class:`CubeSandboxClient`. Holds only the sandbox-lifecycle and command-execution settings the - bare code executor consumes. Workspace-runtime knobs (e.g. the + bare sandbox client consumes. Workspace-runtime knobs (e.g. the remote workspace root) live in :class:`CubeWorkspaceRuntimeConfig` - so executor-only callers never see fields they don't use (ISP). + so client-only callers never see fields they don't use (ISP). - The optional ``e2b-code-interpreter`` dependency must be installed - (it transitively pulls in ``e2b``). Credentials may be supplied here - or through ``E2B_API_URL`` / ``E2B_API_KEY``. The Cube template id + Credentials may be supplied here or through ``E2B_API_URL`` / ``E2B_API_KEY``. The Cube template id may be supplied here or through ``CUBE_TEMPLATE_ID``. """ @@ -47,6 +45,14 @@ class CubeCodeExecutorConfig: sandbox_id: Optional[str] = None """Existing remote sandbox id. When set, factories attach instead of create.""" + auto_recover: bool = False + """Whether ``CubeSandboxClient`` should recreate expired sandboxes. + + Disabled by default to preserve the original lifecycle contract. When + enabled, sandbox operations recreate a fresh sandbox after + ``SandboxNotFoundException`` and retry the failed operation once. + """ + execute_timeout: float = DEFAULT_EXECUTE_TIMEOUT """Default per-command timeout in seconds. @@ -99,6 +105,10 @@ def resolve_api_key(self) -> str: return value +# Deprecated, will be removed in the future +CubeCodeExecutorConfig = CubeClientConfig + + @dataclass class CubeWorkspaceRuntimeConfig: """Configuration for :class:`CubeWorkspaceRuntime`. 
diff --git a/trpc_agent_sdk/skills/_repository.py b/trpc_agent_sdk/skills/_repository.py index 3658287..38fd71f 100644 --- a/trpc_agent_sdk/skills/_repository.py +++ b/trpc_agent_sdk/skills/_repository.py @@ -27,6 +27,8 @@ import yaml from trpc_agent_sdk.context import InvocationContext from trpc_agent_sdk.code_executors import BaseWorkspaceRuntime +from trpc_agent_sdk.code_executors import WorkspaceRuntimeResolver +from trpc_agent_sdk.code_executors import get_workspace_runtime_with_resolver from trpc_agent_sdk.code_executors import create_local_workspace_runtime from trpc_agent_sdk.log import logger @@ -51,14 +53,21 @@ class BaseSkillRepository(abc.ABC): must satisfy. Parsing internals are left entirely to subclasses. """ - def __init__(self, workspace_runtime: BaseWorkspaceRuntime, visibility_filter: VisibilityFilter | None = None): + def __init__(self, + workspace_runtime: BaseWorkspaceRuntime, + visibility_filter: VisibilityFilter | None = None, + workspace_runtime_resolver: Optional[WorkspaceRuntimeResolver] = None): self._workspace_runtime = workspace_runtime self._visibility_filter = visibility_filter + self._workspace_runtime_resolver = workspace_runtime_resolver @property def workspace_runtime(self) -> BaseWorkspaceRuntime: return self._workspace_runtime + def get_workspace_runtime(self, ctx: InvocationContext) -> BaseWorkspaceRuntime: + return get_workspace_runtime_with_resolver(ctx, self._workspace_runtime_resolver, self._workspace_runtime) + @property def visibility_filter(self) -> VisibilityFilter | None: """Return the filter function.""" @@ -136,6 +145,7 @@ def __init__( self, *roots: str, workspace_runtime: Optional[BaseWorkspaceRuntime] = None, + workspace_runtime_resolver: Optional[WorkspaceRuntimeResolver] = None, resolver: Optional[SkillRootResolver] = None, enable_hot_reload: bool = False, ): @@ -151,7 +161,7 @@ def __init__( """ if workspace_runtime is None: workspace_runtime = create_local_workspace_runtime() - 
super().__init__(workspace_runtime) + super().__init__(workspace_runtime, workspace_runtime_resolver=workspace_runtime_resolver) self._resolver = resolver or SkillRootResolver() self._skill_paths: dict[str, str] = {} # name -> base dir self._all_descriptions: dict[str, str] = {} # name -> description diff --git a/trpc_agent_sdk/skills/_toolset.py b/trpc_agent_sdk/skills/_toolset.py index cdaab4d..d1d186e 100644 --- a/trpc_agent_sdk/skills/_toolset.py +++ b/trpc_agent_sdk/skills/_toolset.py @@ -100,6 +100,7 @@ def __init__(self, self._repository = repository or FsSkillRepository( *(paths or []), enable_hot_reload=enable_hot_reload, + workspace_runtime_resolver=workspace_runtime_resolver, ) self._skill_config = skill_config or DEFAULT_SKILL_CONFIG self._create_ws_name_cb = create_ws_name_cb or default_create_ws_name_callback diff --git a/trpc_agent_sdk/skills/stager/_base_stager.py b/trpc_agent_sdk/skills/stager/_base_stager.py index 3119193..2775912 100644 --- a/trpc_agent_sdk/skills/stager/_base_stager.py +++ b/trpc_agent_sdk/skills/stager/_base_stager.py @@ -79,7 +79,7 @@ async def stage_skill(self, request: SkillStageRequest) -> SkillStageResult: ctx = request.ctx ws = request.workspace root = request.repository.path(request.skill_name) - runtime = request.repository.workspace_runtime + runtime = request.repository.get_workspace_runtime(ctx) name = request.skill_name digest = compute_dir_digest(root) md = await self.load_workspace_metadata(ctx, runtime, ws) diff --git a/trpc_agent_sdk/skills/tools/_skill_exec.py b/trpc_agent_sdk/skills/tools/_skill_exec.py index b9f6eb6..0b0a428 100644 --- a/trpc_agent_sdk/skills/tools/_skill_exec.py +++ b/trpc_agent_sdk/skills/tools/_skill_exec.py @@ -419,7 +419,7 @@ async def _run_async_impl( repository = self._run_tool._get_repository(tool_context) # Workspace creation - workspace_runtime = repository.workspace_runtime + workspace_runtime = repository.get_workspace_runtime(tool_context) manager = 
workspace_runtime.manager(tool_context) workspace_id = self._create_ws_name_cb(tool_context) ws = await manager.create_workspace(workspace_id, tool_context) diff --git a/trpc_agent_sdk/skills/tools/_skill_load.py b/trpc_agent_sdk/skills/tools/_skill_load.py index 869f90d..eddf57a 100644 --- a/trpc_agent_sdk/skills/tools/_skill_load.py +++ b/trpc_agent_sdk/skills/tools/_skill_load.py @@ -107,7 +107,7 @@ async def _run_async_impl(self, *, tool_context: InvocationContext, args: dict[s async def _ensure_staged(self, *, ctx: InvocationContext, skill_name: str) -> None: repository = self._get_repository(ctx) - runtime = repository.workspace_runtime + runtime = repository.get_workspace_runtime(ctx) manager = runtime.manager(ctx) ws_id = self._create_ws_name_cb(ctx) ws = await manager.create_workspace(ws_id, ctx) diff --git a/trpc_agent_sdk/skills/tools/_skill_run.py b/trpc_agent_sdk/skills/tools/_skill_run.py index 8760f82..0102d9a 100644 --- a/trpc_agent_sdk/skills/tools/_skill_run.py +++ b/trpc_agent_sdk/skills/tools/_skill_run.py @@ -18,6 +18,7 @@ from pydantic import BaseModel from pydantic import Field +from trpc_agent_sdk.code_executors import BaseWorkspaceRuntime from trpc_agent_sdk.code_executors import CodeFile from trpc_agent_sdk.code_executors import DIR_OUT from trpc_agent_sdk.code_executors import DIR_SKILLS @@ -507,6 +508,7 @@ async def _prepare_editor_env( self, ctx: InvocationContext, ws: WorkspaceInfo, + workspace_runtime: BaseWorkspaceRuntime, env: dict[str, str], editor_text: str, ) -> None: @@ -526,7 +528,6 @@ async def _prepare_editor_env( # Try using workspace FS (works for container runtimes too) try: script_content = _build_editor_wrapper_script(content_abs) - workspace_runtime = self._get_repository(ctx).workspace_runtime fs = workspace_runtime.fs(ctx) await fs.put_files( ws, @@ -619,10 +620,10 @@ async def _auto_export_workspace_out( self, ctx: InvocationContext, ws: WorkspaceInfo, + workspace_runtime: BaseWorkspaceRuntime, ) -> list[CodeFile]: 
"""Collect up to _AUTO_EXPORT_MAX files from out/** automatically.""" try: - workspace_runtime = self._get_repository(ctx).workspace_runtime fs = workspace_runtime.fs(ctx) files = await fs.collect(ws, [_AUTO_EXPORT_PATTERN], ctx) if not files: @@ -669,7 +670,7 @@ async def _run_async_impl( repository = self._get_repository(tool_context) workspace_id = self._create_ws_name_cb(tool_context) - workspace_runtime = repository.workspace_runtime + workspace_runtime = repository.get_workspace_runtime(tool_context) manager = workspace_runtime.manager(tool_context) ws = await manager.create_workspace(workspace_id, tool_context) @@ -695,15 +696,15 @@ async def _run_async_impl( await fs.stage_inputs(ws, inputs.inputs, tool_context) cwd = self._resolve_cwd(inputs.cwd, workspace_skill_dir) - result = await self._run_program(tool_context, ws, cwd, inputs) + result = await self._run_program(tool_context, ws, workspace_runtime, cwd, inputs) # Collect explicit outputs files: list[SkillRunFile] - files, manifest = await self._prepare_outputs(tool_context, ws, inputs) + files, manifest = await self._prepare_outputs(tool_context, ws, workspace_runtime, inputs) # Auto-export out/** only when no explicit outputs requested if not files and manifest is None and not inputs.outputs and not inputs.output_files: - auto_raw = await self._auto_export_workspace_out(tool_context, ws) + auto_raw = await self._auto_export_workspace_out(tool_context, ws, workspace_runtime) if auto_raw: files = self._to_run_files(auto_raw) @@ -756,6 +757,7 @@ async def _run_program( self, ctx: InvocationContext, ws: WorkspaceInfo, + workspace_runtime: BaseWorkspaceRuntime, cwd: str, input_data: SkillRunInput, ) -> WorkspaceRunResult: @@ -784,12 +786,11 @@ async def _run_program( pass # Stage editor helper if requested - await self._prepare_editor_env(ctx, ws, env, input_data.editor_text) + await self._prepare_editor_env(ctx, ws, workspace_runtime, env, input_data.editor_text) # Build command (with venv activation or 
command restrictions) cmd, cmd_args = self._build_command(input_data.command, ws.path, cwd) - workspace_runtime = repository.workspace_runtime runner = workspace_runtime.runner(ctx) ret = await runner.run_program( ws, @@ -843,10 +844,10 @@ async def _prepare_outputs( self, ctx: InvocationContext, ws: WorkspaceInfo, + workspace_runtime: BaseWorkspaceRuntime, input_data: SkillRunInput, ) -> tuple[list[SkillRunFile], Optional[ManifestOutput]]: """Collect files via OutputSpec or legacy output_files patterns.""" - workspace_runtime = self._get_repository(ctx).workspace_runtime fs = workspace_runtime.fs(ctx) if input_data.outputs and not input_data.output_files: diff --git a/trpc_agent_sdk/skills/tools/_workspace_exec.py b/trpc_agent_sdk/skills/tools/_workspace_exec.py index 2d60830..544cc27 100644 --- a/trpc_agent_sdk/skills/tools/_workspace_exec.py +++ b/trpc_agent_sdk/skills/tools/_workspace_exec.py @@ -20,6 +20,7 @@ from trpc_agent_sdk.code_executors import BaseProgramSession from trpc_agent_sdk.code_executors import BaseWorkspaceRuntime from trpc_agent_sdk.code_executors import WorkspaceRuntimeResolver +from trpc_agent_sdk.code_executors import get_workspace_runtime_with_resolver from trpc_agent_sdk.code_executors import DEFAULT_EXEC_YIELD_MS from trpc_agent_sdk.code_executors import DEFAULT_SESSION_KILL_SEC from trpc_agent_sdk.code_executors import DEFAULT_SESSION_TTL_SEC @@ -192,9 +193,7 @@ def __init__( self._sessions: dict[str, _ExecSession] = {} def _runtime(self, ctx: InvocationContext) -> BaseWorkspaceRuntime: - if self._workspace_runtime_resolver is not None: - return self._workspace_runtime_resolver(ctx) - return self._workspace_runtime + return get_workspace_runtime_with_resolver(ctx, self._workspace_runtime_resolver, self._workspace_runtime) async def _workspace(self, ctx: InvocationContext) -> tuple[BaseWorkspaceRuntime, WorkspaceInfo]: runtime = self._runtime(ctx) From a44a9b6f3b0810152673a06b6f158e45bf196b52 Mon Sep 17 00:00:00 2001 From: raychen 
<815315825@qq.com> Date: Wed, 3 Jun 2026 15:18:24 +0800 Subject: [PATCH 7/7] =?UTF-8?q?version:=20=E5=8F=91=E5=B8=831.1.6=E7=89=88?= =?UTF-8?q?=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 17 +++++++++++++++-- tests/test_version.py | 2 +- trpc_agent_sdk/version.py | 2 +- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7da6ff..14d17ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,20 @@ # Changelog +## [1.1.6](https://github.com/trpc-group/trpc-agent-python/releases/tag/v1.1.6) (2026-06-03) + +### Features + +* Skill: Added a recoverable Cube sandbox runtime for skills, including `CubeClientConfig`, a unified `create_cube_sandbox_client` entry point, optional `auto_recover` support in `CubeSandboxClient`, sandbox lifecycle helpers, and direct `CubeWorkspaceRuntime` creation from the client. +* Skill: Unified skill load/run/exec/stager paths around repository-level workspace runtime resolution via `repository.get_workspace_runtime(ctx)`, so tools under the same skill repository share one workspace runtime context. +* MCP: Added MCP tool caching to avoid repeated network access. +* Tools: Added `GraphAgent` support in `AgentTool`, allowing wrapped graph agents to return results from tool context state. +* Examples/Eval: Restored evaluation examples that were previously removed during open-source cleanup. +* Optimizer: Added support for the prompt self-optimization `AgentOptimizer`. + +### Bug Fixes + +* Storage: Fixed frequent sqlite warnings in `SqlSessionService` by consistently using database-side `func.now()` for update timestamps. + ## [1.1.5](https://github.com/trpc-group/trpc-agent-python/releases/tag/v1.1.5) (2026-05-19) ### Features @@ -23,8 +38,6 @@ * Tools: Removed default `mempalace_tool` exports from `trpc_agent_sdk.tools` to avoid forcing MemPalace optional dependencies during base package import. 
- - ## [1.1.3](https://github.com/trpc-group/trpc-agent-python/releases/tag/v1.1.3) (2026-05-12) ### Features diff --git a/tests/test_version.py b/tests/test_version.py index 15f991c..496cdd7 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -10,4 +10,4 @@ def test_version(): """Test the version module.""" - assert __version__ == '1.1.5' + assert __version__ == '1.1.6' diff --git a/trpc_agent_sdk/version.py b/trpc_agent_sdk/version.py index b1c2cae..e236095 100644 --- a/trpc_agent_sdk/version.py +++ b/trpc_agent_sdk/version.py @@ -9,4 +9,4 @@ This module defines the version information for TRPC Agent """ -__version__ = '1.1.5' +__version__ = '1.1.6'