Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

[![CI](https://github.com/NVIDIA-NeMo/DataDesigner/actions/workflows/ci.yml/badge.svg)](https://github.com/NVIDIA-NeMo/DataDesigner/actions/workflows/ci.yml)
[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Python 3.10 - 3.14](https://img.shields.io/badge/🐍_Python-3.10_|_3.11_|_3.12_|_3.13_|_3.14-blue.svg)](https://www.python.org/downloads/) [![NeMo Microservices](https://img.shields.io/badge/NeMo-Microservices-76b900)](https://docs.nvidia.com/nemo/microservices/latest/index.html) [![Code](https://img.shields.io/badge/Code-Documentation-8A2BE2.svg)](https://nvidia-nemo.github.io/DataDesigner/) ![Tokens](https://img.shields.io/badge/400+_Billion-Tokens_Generated-76b900.svg?logo=nvidia&logoColor=white)
[![Python 3.10 - 3.14](https://img.shields.io/badge/🐍_Python-3.10_|_3.11_|_3.12_|_3.13_|_3.14-blue.svg)](https://www.python.org/downloads/) [![NeMo Microservices](https://img.shields.io/badge/NeMo-Microservices-76b900)](https://docs.nvidia.com/nemo/microservices/latest/index.html) [![Code](https://img.shields.io/badge/Code-Documentation-8A2BE2.svg)](https://nvidia-nemo.github.io/DataDesigner/) ![Tokens](https://img.shields.io/badge/2.6T+-Tokens_Processed-76b900.svg?logo=nvidia&logoColor=white)

**Generate high-quality synthetic datasets from scratch or using your own seed data.**

Expand Down Expand Up @@ -153,11 +153,11 @@ Disable with `NEMO_TELEMETRY_ENABLED=false`. **[More details →](#telemetry-and

### Top models (YTD)

Aggregate model usage across synthetic data generation jobs, year-to-date 1/1/2026–5/1/2026:
Aggregate model usage across synthetic data generation jobs, year-to-date 1/1/2026–6/1/2026:

![Top models used for synthetic data generation](docs/images/top-models.png)

_Last updated on May 1, 2026_
_Last updated on June 1, 2026_

---

Expand Down
Binary file modified docs/images/top-models.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
284 changes: 284 additions & 0 deletions docs/scripts/generate_top_models_figure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "matplotlib==3.9.4",
# ]
# ///
"""Regenerate the "Top Model Usage" telemetry figure.

Renders the ranked input-vs-output token breakdown shown in the README's
"Top models (YTD)" section, styled to match the Data Designer devnote charts
(near-black canvas, NVIDIA-green duotone). The same PNG is written to both
tracked copies so the README and Fern docs site stay in sync:

docs/images/top-models.png (rendered by the README)
fern/images/top-models.png (Fern's /images/* mirror)

The source telemetry export lives at docs/scripts/top-model-usage.csv with
columns: model name, input (context) tokens, output (generated) tokens, plus a
trailing "Other" aggregate row. Drop in a fresh export to refresh the figure.

Run:
# Regenerate from the committed CSV (zero args)
uv run docs/scripts/generate_top_models_figure.py

# Refresh from a new telemetry export
uv run docs/scripts/generate_top_models_figure.py --csv ~/Downloads/new-export.csv

# Options
uv run docs/scripts/generate_top_models_figure.py --help
"""

from __future__ import annotations

import argparse
import csv
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.ticker import FuncFormatter, MaxNLocator

# Repo root is two levels up from docs/scripts/.
REPO_ROOT = Path(__file__).resolve().parents[2]
DEFAULT_CSV = REPO_ROOT / "docs" / "scripts" / "top-model-usage.csv"
# Tracked copies of the figure; first entry is the canonical render target.
# docs/images/ is what the README renders; fern/images/ is Fern's mirror for
# /images/* references.
TARGETS = (
REPO_ROOT / "docs" / "images" / "top-models.png",
REPO_ROOT / "fern" / "images" / "top-models.png",
)
Comment thread
eric-tramel marked this conversation as resolved.

# ---------------------------------------------------------------- palette ----
BG = "#0E0E0E" # near-black canvas (matches DD devnote charts)
GREEN = "#76B900" # NVIDIA green -> input (context) tokens
LIME = "#C5E86C" # light NVIDIA-tint green -> output (generated) tokens
WHITE = "#FFFFFF"
SUBTLE = "#9A9A9A"
AXIS = "#B8B8B8"
MODELNAME = "#ECECEC"
GRID = "#FFFFFF"
SPINE = "#4A4A4A"
INK = "#0E0E0E" # dark ink for labels sitting on bright bars

B = 1e9 # render token counts in billions


def load_rows(csv_path: Path) -> list[tuple[str, float, float]]:
"""Parse the telemetry CSV into (name, input_tokens, output_tokens) rows."""
rows: list[tuple[str, float, float]] = []
with csv_path.open(newline="", encoding="utf-8-sig") as fh:
reader = csv.reader(fh)
next(reader) # header
for name, inp, out in reader:
rows.append((name, float(inp.replace(",", "")), float(out.replace(",", ""))))
return rows


def configure_matplotlib() -> None:
"""Pin rendering to deterministic settings so the asset is reproducible.

Forces the Agg backend and matplotlib's bundled DejaVu Sans face rather than
opportunistically selecting a system Helvetica/Arial. Combined with the
pinned matplotlib version in the script metadata, this keeps the checked-in
PNG byte-reproducible across machines and CI.
"""
plt.switch_backend("Agg")
rcParams["font.family"] = "DejaVu Sans"
rcParams["font.size"] = 13


def fmt(v: float) -> str:
"""Compact billions/trillions label."""
if v >= 1e12:
return f"{v / 1e12:.2f}T"
return f"{v / 1e9:.0f}B"


def render(rows: list[tuple[str, float, float]], out_path: Path) -> None:
"""Render the ranked stacked-bar figure to out_path."""
# Split the "Other" aggregate out; sort named models by total descending.
other = next((r for r in rows if r[0].lower() == "other"), None)
models = [r for r in rows if r[0].lower() != "other"]
models.sort(key=lambda r: r[1] + r[2], reverse=True)

n = len(models)
ypos = list(range(n, 0, -1)) # n, n-1, ... 1 (top -> down)
labels = [m[0] for m in models]
inputs = [m[1] for m in models]
outputs = [m[2] for m in models]

if other is not None:
ypos.append(-0.6) # gap below the named models
labels.append("Other models")
inputs.append(other[1])
outputs.append(other[2])

fig, ax = plt.subplots(figsize=(14.5, 9.2), dpi=200)
fig.patch.set_facecolor(BG)
ax.set_facecolor(BG)

xmax = max(i + o for i, o in zip(inputs, outputs)) / B
bar_h = 0.62

for idx, (y, inp, out) in enumerate(zip(ypos, inputs, outputs)):
is_other = other is not None and idx == len(ypos) - 1
a = 0.45 if is_other else 1.0

ax.barh(y, inp / B, height=bar_h, color=GREEN, alpha=a, zorder=3, edgecolor=BG, linewidth=1.2)
ax.barh(y, out / B, height=bar_h, left=inp / B, color=LIME, alpha=a, zorder=3, edgecolor=BG, linewidth=1.2)

total = (inp + out) / B
ax.text(
total + xmax * 0.008,
y,
fmt(inp + out),
va="center",
ha="left",
color=SUBTLE if is_other else WHITE,
fontweight="bold",
fontsize=13.5,
zorder=5,
)

# In-segment value labels only where the segment is wide enough.
if inp / B > xmax * 0.085:
ax.text(
(inp / B) / 2,
y,
fmt(inp),
va="center",
ha="center",
color=INK,
fontweight="bold",
fontsize=11.5,
alpha=a,
zorder=5,
)
if out / B > xmax * 0.085:
ax.text(
inp / B + (out / B) / 2,
y,
fmt(out),
va="center",
ha="center",
color=INK,
fontweight="bold",
fontsize=11.5,
alpha=a,
zorder=5,
)

# ------------------------------------------------------------- axes -----
ax.set_yticks(ypos)
ax.set_yticklabels(labels, fontsize=12.5)
is_other_flags = [False] * n + ([True] if other else [])
for tick, is_other in zip(ax.get_yticklabels(), is_other_flags):
tick.set_color(SUBTLE if is_other else MODELNAME)
if is_other:
tick.set_fontstyle("italic")

ax.set_xlim(0, xmax * 1.13)
ax.set_ylim(-1.3, n + 0.8)

# Derive ticks from the data so the axis stays sane as totals grow; fmt()
# promotes B -> T automatically, so the labels never need hand-editing.
ax.xaxis.set_major_locator(MaxNLocator(nbins=8, steps=[1, 2, 2.5, 5, 10]))
ax.xaxis.set_major_formatter(FuncFormatter(lambda v, _pos: "0" if v <= 0 else fmt(v * B)))
ax.tick_params(axis="y", length=0, pad=10)
ax.tick_params(axis="x", colors=AXIS, length=0, pad=8, labelsize=11)
ax.set_xlabel("Tokens processed", color=AXIS, fontsize=12.5, labelpad=12)

ax.xaxis.grid(True, color=GRID, alpha=0.07, linewidth=1, zorder=0)
ax.set_axisbelow(True)
for s in ("top", "right"):
ax.spines[s].set_visible(False)
for s in ("bottom", "left"):
ax.spines[s].set_color(SPINE)
ax.spines[s].set_linewidth(1.0)

# ---------------------------------------------------------- titling -----
fig.subplots_adjust(left=0.235, right=0.965, top=0.83, bottom=0.085)
# Signature DD green left-accent rule (mirrors the .devnote-dek element).
ax.add_patch(
plt.Rectangle(
(-0.018, 1.045),
0.006,
0.135,
transform=ax.transAxes,
facecolor=GREEN,
edgecolor="none",
clip_on=False,
zorder=6,
)
)
ax.text(
0.012,
1.145,
"Top Model Usage",
transform=ax.transAxes,
color=WHITE,
fontweight="bold",
fontsize=26,
ha="left",
va="bottom",
)
ax.text(
0.012,
1.07,
"Context vs. generated tokens across the most-used models",
transform=ax.transAxes,
color=SUBTLE,
fontsize=13.5,
ha="left",
va="bottom",
)

# Manual legend, top-right of the plotting area.
leg_x, leg_y = 0.99, 1.115
legend = [(GREEN, "Input · context tokens"), (LIME, "Output · generated tokens")]
for i, (c, lbl) in enumerate(legend):
yy = leg_y - i * 0.052
ax.add_patch(
plt.Rectangle(
(leg_x - 0.205, yy - 0.012),
0.022,
0.026,
transform=ax.transAxes,
facecolor=c,
edgecolor="none",
clip_on=False,
zorder=6,
)
)
ax.text(leg_x - 0.172, yy, lbl, transform=ax.transAxes, color=MODELNAME, fontsize=12, ha="left", va="center")

out_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out_path, facecolor=BG, dpi=200, bbox_inches="tight", pad_inches=0.25)
plt.close(fig)


def main() -> None:
parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
parser.add_argument("--csv", type=Path, default=DEFAULT_CSV, help=f"Telemetry export CSV (default: {DEFAULT_CSV})")
args = parser.parse_args()

configure_matplotlib()
rows = load_rows(args.csv)

primary, *mirrors = TARGETS
render(rows, primary)
for mirror in mirrors:
mirror.parent.mkdir(parents=True, exist_ok=True)
shutil.copyfile(primary, mirror)

for target in TARGETS:
print(f"wrote {target.relative_to(REPO_ROOT)}")


if __name__ == "__main__":
main()
12 changes: 12 additions & 0 deletions docs/scripts/top-model-usage.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"Top 10 Model Usage","Input Tokens (Context)","Output Tokens (Generated)"
"openai/gpt-oss-120b","581,991,035,603","69,823,305,523"
"google/gemma-4-31B-it","305,097,721,372","139,909,403,045"
"Qwen/Qwen3-VL-235B-A22B-Instruct","252,299,362,661","2,506,282,983"
"Qwen/Qwen3.5-397B-A17B-FP8","185,392,972,434","72,214,577,833"
"google/gemma-4-26B-A4B-it","112,014,037,550","16,872,099,656"
"Qwen/Qwen3.5-122B-A10B","87,216,522,178","41,888,115,144"
"gcp/google/gemini-3.1-flash-lite-preview","61,793,069,244","7,206,950,344"
"Qwen/Qwen3-VL-235B-A22B-Thinking-FP8","52,889,942,762","9,031,174,934"
"Qwen/Qwen3.6-35B-A3B","46,115,903,437","4,269,353,359"
"Qwen/Qwen3-VL-30B-A3B-Thinking","42,718,861,428","7,201,483,397"
Other,"394,226,701,751","189,813,318,234"
Binary file removed fern/assets/images/top-models.png
Binary file not shown.
Binary file modified fern/images/top-models.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading