Skip to content

Commit 17f4454

Browse files
committed
feat: refactor validator to support DPO datasets
1 parent e0a7c86 commit 17f4454

18 files changed

Lines changed: 838 additions & 129 deletions

File tree

packages/cli/src/commands/dataset/upload.ts

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import {
33
detectOutputFormat,
44
uploadDataset,
55
validateDataset,
6+
parseDatasetSchemaFlag,
67
MAX_DATASET_BYTES,
78
BailianError,
89
ExitCode,
@@ -28,7 +29,8 @@ function formatIssue(issue: ValidationResult["errors"][number]): string {
2829
export default defineCommand({
2930
name: "dataset upload",
3031
description: "Upload a dataset file (.jsonl) to Bailian",
31-
usage: "bl dataset upload --file <path> [--purpose <name>] [--no-validate] [--full-validate]",
32+
usage:
33+
"bl dataset upload --file <path> [--purpose <name>] [--schema <chatml|dpo>] [--no-validate] [--full-validate]",
3234
options: [
3335
{
3436
flag: "--file <path>",
@@ -39,6 +41,11 @@ export default defineCommand({
3941
flag: "--purpose <name>",
4042
description: 'Dataset purpose tag (default: "fine-tune"; e.g. "evaluation")',
4143
},
44+
{
45+
flag: "--schema <s>",
46+
description:
47+
'Record schema: "chatml" (SFT) or "dpo" (requires chosen/rejected). Default auto-detects per record.',
48+
},
4249
{
4350
flag: "--no-validate",
4451
description: "Skip the local JSONL pre-flight check (not recommended)",
@@ -52,15 +59,19 @@ export default defineCommand({
5259
],
5360
examples: [
5461
"bl dataset upload --file train.jsonl",
62+
"bl dataset upload --file dpo.jsonl --schema dpo",
5563
"bl dataset upload --file eval.jsonl --purpose evaluation",
5664
"bl dataset upload --file train.jsonl --full-validate",
5765
"bl dataset upload --file train.jsonl --no-validate",
5866
],
5967
notes: [
60-
"Only .jsonl is supported in this release. The default validator expects a",
61-
'ChatML schema (each line a JSON object with a "messages" array). Other',
62-
"purposes may carry a different schema in the future and would be served",
63-
"by a purpose-specific validator at that point.",
68+
"Only .jsonl is supported in this release. Two record schemas are",
69+
"recognized: chatml = {messages:[...]} (SFT); dpo = {messages:[...],",
70+
"chosen, rejected} where chosen/rejected are single assistant messages.",
71+
"With no --schema, a record carrying chosen/rejected is validated as DPO;",
72+
"pass --schema dpo to require it on every record, or --schema chatml to",
73+
"ignore preference fields. Other purposes may carry a different schema in",
74+
"the future and would be served by a purpose-specific validator.",
6475
"The dataset upload cap is 300MB per file.",
6576
"Upload uses the OpenAI-compatible /compatible-mode/v1/files endpoint so",
6677
"the purpose tag is persisted (the DashScope-native /api/v1/files drops it).",
@@ -72,10 +83,11 @@ export default defineCommand({
7283
const purpose = (flags.purpose as string | undefined) || "fine-tune";
7384
const skipValidate = Boolean(flags.noValidate);
7485
const fullValidate = Boolean(flags.fullValidate);
86+
const schema = parseDatasetSchemaFlag(flags.schema as string | undefined);
7587
const format = detectOutputFormat(config.output);
7688

7789
if (!skipValidate) {
78-
const result = await validateDataset(filePath!, { fullValidate });
90+
const result = await validateDataset(filePath!, { fullValidate, schema });
7991
if (!result.valid) {
8092
const lines = [
8193
`Dataset validation failed for ${filePath}`,
@@ -112,6 +124,7 @@ export default defineCommand({
112124
purpose,
113125
max_bytes: MAX_DATASET_BYTES,
114126
validate: !skipValidate,
127+
schema: schema ?? "auto",
115128
},
116129
format,
117130
);

packages/cli/src/commands/dataset/validate.ts

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import {
22
defineCommand,
33
detectOutputFormat,
44
validateDataset,
5+
parseDatasetSchemaFlag,
56
BailianError,
67
ExitCode,
78
type Config,
@@ -32,39 +33,58 @@ function formatStats(r: ValidationResult): string[] {
3233
export default defineCommand({
3334
name: "dataset validate",
3435
description: "Locally validate a dataset file (.jsonl) without uploading",
35-
usage: "bl dataset validate --file <path> [--full-validate]",
36+
usage: "bl dataset validate --file <path> [--full-validate] [--schema <chatml|dpo>]",
3637
options: [
3738
{ flag: "--file <path>", description: "Local .jsonl dataset file", required: true },
3839
{
3940
flag: "--full-validate",
4041
description: "JSON.parse every line instead of sampling (slower)",
4142
type: "boolean",
4243
},
44+
{
45+
flag: "--schema <s>",
46+
description:
47+
'Record schema: "chatml" (SFT) or "dpo" (requires chosen/rejected). Default auto-detects per record.',
48+
},
4349
],
4450
examples: [
4551
"bl dataset validate --file train.jsonl",
52+
"bl dataset validate --file dpo.jsonl --schema dpo",
4653
"bl dataset validate --file eval.jsonl --full-validate",
4754
"bl dataset validate --file train.jsonl --output json",
4855
],
4956
notes: [
5057
"Default scan: every line gets a structural check, then ~160 lines (front 50,",
5158
"evenly spaced 100, last 10) are JSON.parsed against the active schema.",
52-
"Today the only registered .jsonl schema is ChatML (messages array).",
59+
"Schemas: chatml = {messages:[...]} (SFT); dpo = {messages:[...], chosen,",
60+
"rejected} where chosen/rejected are single assistant messages. With no",
61+
"--schema, a record carrying chosen/rejected is validated as DPO; pass",
62+
"--schema dpo to require chosen/rejected on every record (strict), or",
63+
"--schema chatml to ignore preference fields.",
5364
"Use --full-validate to JSON.parse every line.",
5465
],
5566
async run(config: Config, flags: GlobalFlags) {
5667
const filePath = flags.file as string | undefined;
5768
if (!filePath) failIfMissing("file", "bl dataset validate --file <path>");
5869

5970
const fullValidate = Boolean(flags.fullValidate);
71+
const schema = parseDatasetSchemaFlag(flags.schema as string | undefined);
6072
const format = detectOutputFormat(config.output);
6173

6274
if (config.dryRun) {
63-
emitResult({ action: "dataset.validate", file: filePath, full: fullValidate }, format);
75+
emitResult(
76+
{
77+
action: "dataset.validate",
78+
file: filePath,
79+
full: fullValidate,
80+
schema: schema ?? "auto",
81+
},
82+
format,
83+
);
6484
return;
6585
}
6686

67-
const result = await validateDataset(filePath!, { fullValidate });
87+
const result = await validateDataset(filePath!, { fullValidate, schema });
6888

6989
if (format === "json") {
7090
// For json output we always emit the structured result, exit code conveys validity.

0 commit comments

Comments
 (0)