From 0e20ae593dca6b28e0ec868a2e6452b2f5293278 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Mon, 8 Jun 2026 21:32:17 +0000 Subject: [PATCH 1/2] agent: docs: define public benchmark methodology --- .../PUBLIC-BENCHMARK-METHODOLOGY.md | 221 ++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 docs/benchmarks/PUBLIC-BENCHMARK-METHODOLOGY.md diff --git a/docs/benchmarks/PUBLIC-BENCHMARK-METHODOLOGY.md b/docs/benchmarks/PUBLIC-BENCHMARK-METHODOLOGY.md new file mode 100644 index 0000000..7265e40 --- /dev/null +++ b/docs/benchmarks/PUBLIC-BENCHMARK-METHODOLOGY.md @@ -0,0 +1,221 @@ +# Public Benchmark Methodology + +**Status:** Draft methodology for issue +[#63](https://github.com/ayhammouda/python-docs-mcp-server/issues/63). +Do not publish comparative claims until the harness has produced reproducible +data from this methodology. + +## Purpose + +The v0.5.0 public benchmark compares `python-docs-mcp-server` with eligible +Python documentation MCPs and a no-MCP baseline on a fixed 50-question Python +documentation evaluation. The benchmark reports correctness, token cost, and +latency, with enough detail for a clean clone to reproduce the run. + +This is an evidence artifact, not marketing copy. If the data is boring or +unfavorable, publish the data honestly and adjust the product claims. + +## Evidence Flow + +```mermaid +flowchart LR + Corpus[50-question corpus] --> Runner[Benchmark runner] + Competitors[Pinned MCP competitors] --> Runner + Baseline[No-MCP baseline] --> Runner + Runner --> Transcripts[Raw transcripts] + Transcripts --> Tokens[Token counting after client rewrap] + Transcripts --> Rubric[Correctness scoring] + Transcripts --> Latency[Latency summary] + Tokens --> Report[Public report] + Rubric --> Report + Latency --> Report +``` + +## Systems Under Test + +The final competitor set is locked at execution time. A competitor is eligible +only if all of these are true: + +- It exposes Python standard library documentation retrieval or search. +- It can be run or queried reproducibly from a clean clone. +- Its version, package, image, endpoint, or commit can be pinned. +- Its terms allow benchmark use. +- It does not require private, undocumented access. + +The initial candidate matrix is: + +- `python-docs-mcp-server` +- Context7 +- GitMCP +- DeepWiki +- Ref.tools +- no-MCP baseline + +The no-MCP baseline uses the same model and question prompts, but no retrieved +documentation context. It measures parametric model behavior, not another docs +tool. + +## Corpus Design + +The corpus contains exactly 50 questions. Each question must include: + +- Stable question ID. +- Category. +- Python version or version pair. +- Prompt shown to the model. +- Official-docs answer key. +- Required citations or source sections. +- Expected answer properties. +- Known ambiguity notes, if any. + +Distribution: + +- 15 exact-symbol questions. +- 10 concept or API-usage questions. +- 15 cross-version questions, led by `compare_versions`-style diffs. +- 5 PEP-adjacent questions where the official stdlib docs or "What's New" + pages contain the required answer. +- 5 applied questions that require selecting the right stdlib API from the + documentation. + +The corpus must avoid questions whose answer requires private knowledge, +external package documentation, non-stdlib behavior, or unreleased CPython +changes. + +## Source Of Truth + +Correctness is scored against official Python documentation generated from +CPython source at pinned tags. When a question concerns version behavior, the +answer key must cite the exact relevant version or version pair. + +Allowed truth sources: + +- CPython documentation source at pinned commit or tag. +- Generated official docs for the same Python version. +- Official "What's New" pages for PEP-adjacent behavior. + +Disallowed truth sources: + +- Blog posts. +- Search snippets. +- LLM-generated explanations. +- Third-party mirrors unless used only as a convenience link and verified + against CPython source. + +## Prompting Rules + +Every system under test receives the same user question. The only allowed +difference is the documentation context supplied by that system. + +The model prompt must require: + +- A concise answer. +- Version-specific wording when the question names a version. +- No unsupported claims. +- A short citation to the retrieved section when the system provides one. + +The prompt must not reveal the answer key, rubric, or expected winning system. + +## Correctness Scoring + +Each answer receives one score: + +- `1.0`: Correct, version-aware, and includes all required answer properties. +- `0.5`: Partially correct, but misses a required nuance, version condition, or + citation. +- `0.0`: Incorrect, unsupported, materially incomplete, or answers the wrong + version. + +For public reporting, include both: + +- Mean correctness score. +- Per-category correctness score. + +Any answer that appears correct but lacks evidence from the supplied docs is +marked in the raw results and discussed separately. The benchmark should reward +grounded answers, not confident autocomplete. + +## Token Measurement + +Token methodology follows roadmap decision 5.8 and ADR-006: + +- Use Claude token counting as the primary metric. +- Measure after client-side rewrap, not only raw MCP payload bytes. +- Record raw payload tokens separately as diagnostic data. +- Report serialization latency alongside token counts. + +Primary token count: + +1. Capture the MCP tool response or baseline prompt context. +2. Pass it through the same client-side wrapping path used by the benchmark + client. +3. Count the resulting message envelope with Claude token counting. + +If a client cannot expose its exact wrapped message envelope, the report must +say so and mark that result as an approximation. Approximate counts must not be +used for headline claims. + +## Latency Measurement + +Latency is wall-clock time measured per question from request dispatch to final +answer receipt. + +Report: + +- Median. +- p95. +- Minimum and maximum. +- Cold-run and warm-run separation where the system has a cache or index. + +Index build time is not part of per-query latency. It may be reported as setup +cost in a separate section. + +## Reproducibility + +The public harness must run from a clean clone with one command after dependency +installation. It must write: + +- Competitor manifest with pinned versions. +- Corpus file. +- Raw transcripts. +- Raw scoring records. +- Token-count records. +- Latency records. +- Summary report. + +Result files must include enough metadata to rerun or audit them: + +- Repository commit. +- Python version. +- Operating system. +- Model name and provider. +- MCP client or adapter version. +- Competitor versions. +- Timestamp. + +## Honesty Rules + +- No comparative claim enters README, PyPI, launch copy, or social posts before + public results exist. +- Do not drop failed systems silently. If a competitor cannot run, report the + failure reason and exclude it from scored comparisons. +- Do not change the corpus after seeing results unless the change is documented + and the whole benchmark is rerun. +- Do not tune prompts per competitor. +- Do not report approximate token counts as exact. + +## Harness Work Packages + +Once this methodology is accepted, the implementation can be split into smaller +agent-ready issues: + +1. Corpus schema and fixture loader. +2. Baseline runner and transcript format. +3. `python-docs-mcp-server` runner. +4. Competitor manifest and adapter skeletons. +5. Correctness scorer with manual-adjudication hooks. +6. Claude token-count integration after client rewrap. +7. Latency recorder and report generator. + +Each work package should reference this methodology and use `Refs #63`, not +`Closes #63`, until the full benchmark has produced public data. From 50c40b2437979d923a0db0828c16c39f1f483003 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Mon, 8 Jun 2026 21:35:05 +0000 Subject: [PATCH 2/2] agent: deps: refresh pyjwt lock --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 5a2a031..bbbc0cd 100644 --- a/uv.lock +++ b/uv.lock @@ -475,11 +475,11 @@ wheels = [ [[package]] name = "pyjwt" -version = "2.12.1" +version = "2.13.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c2/27/a3b6e5bf6ff856d2509292e95c8f57f0df7017cf5394921fc4e4ef40308a/pyjwt-2.12.1.tar.gz", hash = "sha256:c74a7a2adf861c04d002db713dd85f84beb242228e671280bf709d765b03672b", size = 102564, upload-time = "2026-03-13T19:27:37.25Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3b/81/58d0ac84e1ef3a3843791d6954d94c0b33d526c75eeb1efbce9d0a4c4077/pyjwt-2.13.0.tar.gz", hash = "sha256:41571c89ca91598c79e8ef18a2d07367d4810fbbd6f637794879baf1b7703423", size = 107515, upload-time = "2026-05-21T19:54:36.618Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/7a/8dd906bd22e79e47397a61742927f6747fe93242ef86645ee9092e610244/pyjwt-2.12.1-py3-none-any.whl", hash = "sha256:28ca37c070cad8ba8cd9790cd940535d40274d22f80ab87f3ac6a713e6e8454c", size = 29726, upload-time = "2026-03-13T19:27:35.677Z" }, + { url = "https://files.pythonhosted.org/packages/a3/5e/ecf12fdb62546d64385c158514e9b2b671f7832108ef2ecd2020ce0af2d1/pyjwt-2.13.0-py3-none-any.whl", hash = "sha256:66adcc2aff09b3f1bbd95fc1e1577df8ac8723c978552fd43304c8a290ac5728", size = 31274, upload-time = "2026-05-21T19:54:35.362Z" }, ] [package.optional-dependencies]