Multi_Agent_Design/mafbench.html at main · CoDS-GCS/Multi_Agent_Design · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Document metadata: encoding, responsive viewport, title and search/share description. -->
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>MAFBench | Unified Benchmark for Multi-Agent LLM Frameworks</title>
    <meta name="description" content="MAFBench is a unified benchmark for evaluating multi-agent LLM frameworks.">
    <!-- NOTE(review): cdn.tailwindcss.com is the Tailwind Play CDN, which Tailwind documents as intended for development only; consider shipping a compiled stylesheet instead. -->
    <script src="https://cdn.tailwindcss.com"></script>
    <!-- Project stylesheet (not visible here; presumably defines .dropdown / .dropdown-menu, confirm in styles/main.css). -->
    <link rel="stylesheet" href="styles/main.css">
    <!-- Font Awesome 6.4.0 provides the fas / fab icon classes used by the <i> elements on this page. -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
</head>
<body class="bg-white">
    <!-- Navigation: fixed top bar with a desktop link row (md breakpoint and up) and a mobile menu that starts hidden. -->
    <nav id="navbar" class="fixed top-0 w-full bg-white/95 backdrop-blur-sm shadow-sm z-50" aria-label="Primary">
        <div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
            <div class="flex justify-between items-center h-16">
                <div class="flex items-center">
                    <a href="index.html" class="text-2xl font-bold text-indigo-600">MAFBench</a>
                </div>
                <!-- Desktop links; the entry for this page carries aria-current so assistive tech announces it as the current page. -->
                <div class="hidden md:flex space-x-8 items-center">
                    <a href="index.html" class="text-gray-600 hover:text-indigo-600">Home</a>
                    <a href="design.html" class="text-gray-600 hover:text-indigo-600">Why Design Matters</a>
                    <div class="dropdown relative">
                        <a href="architecture/index.html" class="text-gray-600 hover:text-indigo-600 flex items-center">
                            Architecture Guide
                            <i class="fas fa-chevron-down ml-1 text-xs" aria-hidden="true"></i>
                        </a>
                        <div class="dropdown-menu">
                            <a href="architecture/index.html">Overview</a>
                            <a href="architecture/single-agent.html">Single-Agent Characteristics</a>
                            <a href="architecture/multi-agent.html">Multi-Agent Characteristics</a>
                            <a href="architecture/environment.html">Environment</a>
                        </div>
                    </div>
                    <a href="results.html" class="text-gray-600 hover:text-indigo-600">Results</a>
                    <a href="mafbench.html" class="text-gray-900 font-medium hover:text-indigo-600" aria-current="page">MAFBench</a>
                    <a href="paper.html" class="text-gray-600 hover:text-indigo-600">Paper &amp; Code</a>
                    <a href="about.html" class="text-gray-600 hover:text-indigo-600">About</a>
                </div>
                <!-- Icon-only control: needs an explicit accessible name because the bars glyph carries no text. -->
                <!-- NOTE(review): presumably scripts/main.js toggles #mobile-menu from this button; if so, it should also keep aria-expanded in sync. -->
                <button id="mobile-menu-btn" type="button" class="md:hidden text-gray-600" aria-label="Toggle navigation menu" aria-controls="mobile-menu">
                    <i class="fas fa-bars text-xl" aria-hidden="true"></i>
                </button>
            </div>
        </div>
        <!-- Mobile menu: same destinations as the desktop row, rendered as a vertical stack. -->
        <div id="mobile-menu" class="hidden md:hidden bg-white border-t">
            <div class="px-4 py-4 space-y-3">
                <a href="index.html" class="block text-gray-600">Home</a>
                <a href="design.html" class="block text-gray-600">Why Design Matters</a>
                <div class="pl-2 border-l-2 border-gray-200">
                    <a href="architecture/index.html" class="block text-gray-900 font-medium mb-2">Architecture Guide</a>
                    <div class="pl-4 space-y-2">
                        <a href="architecture/index.html" class="block text-gray-600 text-sm">Overview</a>
                        <a href="architecture/single-agent.html" class="block text-gray-600 text-sm">Single-Agent Characteristics</a>
                        <a href="architecture/multi-agent.html" class="block text-gray-600 text-sm">Multi-Agent Characteristics</a>
                        <a href="architecture/environment.html" class="block text-gray-600 text-sm">Environment</a>
                    </div>
                </div>
                <a href="results.html" class="block text-gray-600">Results</a>
                <a href="mafbench.html" class="block text-gray-900 font-medium" aria-current="page">MAFBench</a>
                <a href="paper.html" class="block text-gray-600">Paper &amp; Code</a>
                <a href="about.html" class="block text-gray-600">About</a>
            </div>
        </div>
    </nav>

    <!-- Hero section: page title, one-line tagline and the two primary calls to action. -->
    <section class="pt-32 pb-16 px-4 sm:px-6 lg:px-8 bg-gradient-to-b from-indigo-50 to-white">
        <div class="max-w-4xl mx-auto text-center">
            <h1 class="text-5xl md:text-6xl font-bold text-gray-900 mb-6">
                MAFBench
            </h1>
            <p class="text-xl text-gray-600 mb-8">
                A unified benchmark for evaluating multi-agent LLM frameworks
            </p>
            <div class="flex flex-col sm:flex-row gap-4 justify-center">
                <!-- Opens in a new tab; rel keeps the new page from reaching back via window.opener. -->
                <a href="https://github.com/CoDS-GCS/MAFBench" target="_blank" rel="noopener noreferrer" class="px-8 py-4 bg-indigo-600 text-white rounded-lg font-semibold text-lg hover:bg-indigo-700 transition shadow-lg">
                    <i class="fab fa-github mr-2" aria-hidden="true"></i> GitHub Repo
                </a>
                <a href="paper.html" class="px-8 py-4 bg-white text-indigo-600 border-2 border-indigo-600 rounded-lg font-semibold text-lg hover:bg-indigo-50 transition">
                    <i class="fas fa-book mr-2" aria-hidden="true"></i> Documentation
                </a>
            </div>
        </div>
    </section>

    <!-- Benchmarks: verification note, intro paragraph and a table mapping each architectural dimension to its benchmark and metrics. -->
    <section class="py-20 px-4 sm:px-6 lg:px-8 bg-white border-b border-gray-200">
        <div class="max-w-6xl mx-auto">
            <div class="bg-blue-50 border-l-4 border-blue-400 p-4 mb-8 rounded">
                <p class="text-sm text-gray-700">
                    <i class="fas fa-info-circle text-blue-600 mr-2" aria-hidden="true"></i>
                    <strong>Note:</strong> All numerical values and statistics on this page should be verified against the paper PDF for the most accurate and up-to-date results.
                </p>
            </div>
            <h2 class="text-4xl font-bold text-center text-gray-900 mb-12">Benchmarks</h2>
            <p class="text-lg text-gray-700 mb-8 leading-relaxed text-center max-w-3xl mx-auto">
                MAFBench integrates established benchmarks to evaluate multi-agent frameworks across key architectural dimensions. Metrics follow original benchmark definitions (Acc.=Accuracy, F1=F1-score, R@5=Recall@5).
            </p>
            <div class="bg-gray-50 border border-gray-300 rounded-lg overflow-hidden">
                <table class="w-full text-sm">
                    <!-- Visually hidden caption (Tailwind sr-only) so screen readers announce what the table contains. -->
                    <caption class="sr-only">Benchmarks integrated in MAFBench, listed by architectural dimension with their metrics</caption>
                    <thead class="bg-gray-100">
                        <tr>
                            <th scope="col" class="px-6 py-3 text-left font-semibold text-gray-900 border-b border-gray-300">Architectural Dimension</th>
                            <th scope="col" class="px-6 py-3 text-left font-semibold text-gray-900 border-b border-gray-300">Benchmark</th>
                            <th scope="col" class="px-6 py-3 text-left font-semibold text-gray-900 border-b border-gray-300">Metrics</th>
                        </tr>
                    </thead>
                    <!-- First cell of each row is a row header; text-left and font-medium keep it visually identical to the former td. -->
                    <tbody class="divide-y divide-gray-200">
                        <tr>
                            <th scope="row" class="px-6 py-4 text-left text-gray-900 font-medium">Orchestration overhead</th>
                            <td class="px-6 py-4 text-gray-700">Trivial query</td>
                            <td class="px-6 py-4 text-gray-700">Latency, throughput, tokens</td>
                        </tr>
                        <tr>
                            <th scope="row" class="px-6 py-4 text-left text-gray-900 font-medium">Memory architecture</th>
                            <td class="px-6 py-4 text-gray-700"><a href="https://huggingface.co/datasets/ai-hyz/MemoryAgentBench" target="_blank" rel="noopener noreferrer" class="text-indigo-600 hover:underline">MemoryAgentBench</a></td>
                            <td class="px-6 py-4 text-gray-700">Acc./F1/R@5</td>
                        </tr>
                        <tr>
                            <th scope="row" class="px-6 py-4 text-left text-gray-900 font-medium">Planning interface</th>
                            <td class="px-6 py-4 text-gray-700">
                                <a href="https://huggingface.co/datasets/openai/gsm8k" target="_blank" rel="noopener noreferrer" class="text-indigo-600 hover:underline">GSM8K</a>,
                                <a href="https://huggingface.co/datasets/tau/commonsense_qa" target="_blank" rel="noopener noreferrer" class="text-indigo-600 hover:underline">CSQA</a>,
                                <a href="https://github.com/hendrycks/math" target="_blank" rel="noopener noreferrer" class="text-indigo-600 hover:underline">MATH</a>
                            </td>
                            <td class="px-6 py-4 text-gray-700">Accuracy, failures, runtime</td>
                        </tr>
                        <tr>
                            <th scope="row" class="px-6 py-4 text-left text-gray-900 font-medium">Specialization conditioning</th>
                            <td class="px-6 py-4 text-gray-700"><a href="https://fathollahzadeh.github.io/papers/CatDB_VLDB2025.pdf" target="_blank" rel="noopener noreferrer" class="text-indigo-600 hover:underline">CatDB tasks</a></td>
                            <td class="px-6 py-4 text-gray-700">Precision, recall, F1</td>
                        </tr>
                        <tr>
                            <th scope="row" class="px-6 py-4 text-left text-gray-900 font-medium">Coordination topology</th>
                            <td class="px-6 py-4 text-gray-700">
                                <a href="https://huggingface.co/datasets/disco-eth/AgentsNet" target="_blank" rel="noopener noreferrer" class="text-indigo-600 hover:underline">AGENTSNET</a>
                                (<a href="https://github.com/floriangroetschla/AgentsNet" target="_blank" rel="noopener noreferrer" class="text-indigo-600 hover:underline">Code</a>)
                            </td>
                            <td class="px-6 py-4 text-gray-700">Success, rounds, tokens, time</td>
                        </tr>
                    </tbody>
                </table>
            </div>
        </div>
    </section>

    <!-- MAFBench Contributions: six heading-plus-paragraph entries stacked vertically (space-y-8). -->
    <section class="py-20 px-4 sm:px-6 lg:px-8 bg-gray-50 border-b border-gray-200">
        <div class="max-w-6xl mx-auto">
            <h2 class="text-4xl font-bold text-center text-gray-900 mb-12">MAFBench Contributions</h2>

            <div class="space-y-8">
                <!-- 1. Execution pipeline -->
                <div>
                    <h3 class="text-xl font-semibold text-gray-900 mb-3">Unified Execution Pipeline</h3>
                    <p class="text-gray-700 leading-relaxed">MAFBench provides a standardized agent interface for session-level execution, centralizing configuration of model parameters, session limits, batching, and scoring. This ensures identical conditions across frameworks while isolating architectural effects.</p>
                </div>

                <!-- 2. Evaluation method -->
                <div>
                    <h3 class="text-xl font-semibold text-gray-900 mb-3">Semantic Evaluation</h3>
                    <p class="text-gray-700 leading-relaxed">We replace string-based metrics with LLM-based semantic evaluation to handle diverse answer formats, enabling fair comparison across different framework output styles.</p>
                </div>

                <!-- 3. Backend routing -->
                <div>
                    <h3 class="text-xl font-semibold text-gray-900 mb-3">Transparent Backend Routing</h3>
                    <p class="text-gray-700 leading-relaxed">For large-scale long-context evaluation, MAFBench introduces transparent backend routing that redirects compatible API calls to alternative providers (e.g., Groq-hosted models), enabling lower-cost evaluation without modifying framework implementations.</p>
                </div>

                <!-- 4. Logging and reproducibility -->
                <div>
                    <h3 class="text-xl font-semibold text-gray-900 mb-3">Standardized Logging and Reproducibility</h3>
                    <p class="text-gray-700 leading-relaxed">Results are logged and aggregated using a shared schema that captures accuracy, runtime, and token usage. Model selection, planning modes, and run budgets are centrally configured to ensure reproducibility and cost control.</p>
                </div>

                <!-- 5. Topology rewriting -->
                <div>
                    <h3 class="text-xl font-semibold text-gray-900 mb-3">Topology Rewriting Engine</h3>
                    <p class="text-gray-700 leading-relaxed">For coordination evaluation, MAFBench implements a topology rewriting engine that transforms base communication graphs into sequential, hierarchical, and centralized structures while preserving agent sets, enabling systematic analysis of how interaction topology affects coordination dynamics.</p>
                </div>

                <!-- 6. Isolating architectural impact -->
                <div>
                    <h3 class="text-xl font-semibold text-gray-900 mb-3">Isolated Architectural Impact</h3>
                    <p class="text-gray-700 leading-relaxed">By fixing the underlying LLM model, prompts, and task structure, MAFBench attributes performance differences to framework architecture rather than model quality, revealing true architectural impact on memory, planning, specialization, and coordination.</p>
                </div>
            </div>
        </div>
    </section>

    <!-- Get Started: closing call to action linking to the GitHub repository and the paper page. -->
    <section class="py-20 px-4 sm:px-6 lg:px-8 bg-indigo-600">
        <div class="max-w-4xl mx-auto text-center">
            <h2 class="text-4xl font-bold text-white mb-6">Ready to Run Your Own Tests?</h2>
            <p class="text-xl text-indigo-100 mb-8">
                MAFBench is open source and ready to use. Evaluate your framework or contribute to the benchmark.
            </p>
            <div class="flex flex-col sm:flex-row gap-4 justify-center">
                <!-- Opens in a new tab; rel keeps the new page from reaching back via window.opener. -->
                <a href="https://github.com/CoDS-GCS/MAFBench" target="_blank" rel="noopener noreferrer" class="px-8 py-4 bg-white text-indigo-600 rounded-lg font-semibold text-lg hover:bg-gray-100 transition shadow-lg">
                    <i class="fab fa-github mr-2" aria-hidden="true"></i> View on GitHub
                </a>
                <a href="paper.html" class="px-8 py-4 bg-indigo-500 text-white rounded-lg font-semibold text-lg hover:bg-indigo-400 transition">
                    <i class="fas fa-file-pdf mr-2" aria-hidden="true"></i> Read the Paper
                </a>
            </div>
        </div>
    </section>

    <!-- Footer: four-column link grid (single column below the md breakpoint) followed by the copyright line. -->
    <footer class="bg-gray-900 text-gray-300 py-12 px-4 sm:px-6 lg:px-8">
        <div class="max-w-7xl mx-auto">
            <div class="grid md:grid-cols-4 gap-8 mb-8">
                <div>
                    <h3 class="text-white font-bold text-lg mb-4">MAFBench</h3>
                    <p class="text-sm">A unified benchmark for evaluating multi-agent LLM frameworks.</p>
                </div>
                <div>
                    <h4 class="text-white font-semibold mb-4">Resources</h4>
                    <ul class="space-y-2 text-sm">
                        <li><a href="paper.html" class="hover:text-white">Paper</a></li>
                        <li><a href="mafbench.html" class="hover:text-white">Benchmark</a></li>
                        <!-- External link in a new tab: rel prevents window.opener access from the opened page. -->
                        <li><a href="https://github.com/CoDS-GCS/MAFBench" target="_blank" rel="noopener noreferrer" class="hover:text-white">GitHub</a></li>
                    </ul>
                </div>
                <div>
                    <h4 class="text-white font-semibold mb-4">Learn</h4>
                    <ul class="space-y-2 text-sm">
                        <li><a href="architecture/index.html" class="hover:text-white">Architecture Guide</a></li>
                        <li><a href="results.html" class="hover:text-white">Results</a></li>
                    </ul>
                </div>
                <div>
                    <h4 class="text-white font-semibold mb-4">Connect</h4>
                    <ul class="space-y-2 text-sm">
                        <li><a href="about.html" class="hover:text-white">About</a></li>
                        <li><a href="https://github.com/CoDS-GCS/MAFBench" target="_blank" rel="noopener noreferrer" class="hover:text-white">GitHub</a></li>
                    </ul>
                </div>
            </div>
            <div class="border-t border-gray-800 pt-8 text-center text-sm">
                <p>&copy; 2026 MAFBench. Research by Concordia University.</p>
            </div>
        </div>
    </footer>

    <!-- Site script, loaded as the last child of <body> so all markup above is parsed before it runs. -->
    <!-- NOTE(review): presumably wires up #mobile-menu-btn / #mobile-menu and the navbar; confirm in scripts/main.js. -->
    <script src="scripts/main.js"></script>
</body>
</html>