Skip to content

Commit 18d5c42

Browse files
committed
feat: add cpt dataset type
1 parent 9ad85b6 commit 18d5c42

12 files changed

Lines changed: 245 additions & 58 deletions

File tree

packages/cli/src/commands/dataset/upload.ts

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ export default defineCommand({
1919
name: "dataset upload",
2020
description: "Upload a dataset file (.jsonl) to Bailian",
2121
usage:
22-
"bl dataset upload --file <path> [--purpose <name>] [--schema <chatml|dpo>] [--no-validate] [--full-validate]",
22+
"bl dataset upload --file <path> [--purpose <name>] [--schema <chatml|dpo|cpt>] [--no-validate] [--full-validate]",
2323
options: [
2424
{
2525
flag: "--file <path>",
@@ -33,7 +33,7 @@ export default defineCommand({
3333
{
3434
flag: "--schema <s>",
3535
description:
36-
'Record schema: "chatml" (SFT) or "dpo" (requires chosen/rejected). Default auto-detects per record.',
36+
'Record schema: "chatml" (SFT), "dpo" (chosen/rejected), or "cpt" (raw text). Default auto-detects per record.',
3737
},
3838
{
3939
flag: "--no-validate",
@@ -49,17 +49,20 @@ export default defineCommand({
4949
examples: [
5050
"bl dataset upload --file train.jsonl",
5151
"bl dataset upload --file dpo.jsonl --schema dpo",
52+
"bl dataset upload --file cpt.jsonl --schema cpt",
5253
"bl dataset upload --file eval.jsonl --purpose evaluation",
5354
"bl dataset upload --file train.jsonl --full-validate",
5455
"bl dataset upload --file train.jsonl --no-validate",
5556
],
5657
notes: [
57-
"Only .jsonl is supported in this release. Two record schemas are",
58+
"Only .jsonl is supported in this release. Three record schemas are",
5859
"recognized: chatml = {messages:[...]} (SFT); dpo = {messages:[...],",
59-
"chosen, rejected} where chosen/rejected are single assistant messages.",
60-
"With no --schema, a record carrying chosen/rejected is validated as DPO;",
61-
"pass --schema dpo to require it on every record, or --schema chatml to",
62-
"ignore preference fields. Other purposes may carry a different schema in",
60+
"chosen, rejected} where chosen/rejected are single assistant messages;",
61+
'cpt = {text:"..."} (continual pre-training, raw text). With no --schema,',
62+
"a record carrying chosen/rejected is validated as DPO, one with text (and",
63+
"no messages) as CPT, otherwise as ChatML. Pass --schema dpo / cpt to",
64+
"require that shape on every record, or --schema chatml to ignore the",
65+
"preference / text fields. Other purposes may carry a different schema in",
6366
"the future and would be served by a purpose-specific validator.",
6467
"The dataset upload cap is 300MB per file.",
6568
"Upload uses the OpenAI-compatible /compatible-mode/v1/files endpoint so",

packages/cli/src/commands/dataset/validate.ts

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ export default defineCommand({
2828
description: "Locally validate a dataset file (.jsonl) without uploading",
2929
// 纯本地校验,不触网、不需 API key(与 `pipeline validate` 一致)。
3030
skipDefaultApiKeySetup: true,
31-
usage: "bl dataset validate --file <path> [--full-validate] [--schema <chatml|dpo>]",
31+
usage: "bl dataset validate --file <path> [--full-validate] [--schema <chatml|dpo|cpt>]",
3232
options: [
3333
{ flag: "--file <path>", description: "Local .jsonl dataset file", required: true },
3434
{
@@ -39,24 +39,26 @@ export default defineCommand({
3939
{
4040
flag: "--schema <s>",
4141
description:
42-
'Record schema: "chatml" (SFT) or "dpo" (requires chosen/rejected). Default auto-detects per record.',
42+
'Record schema: "chatml" (SFT), "dpo" (chosen/rejected), or "cpt" (raw text). Default auto-detects per record.',
4343
},
4444
],
4545
examples: [
4646
"bl dataset validate --file train.jsonl",
4747
"bl dataset validate --file dpo.jsonl --schema dpo",
48+
"bl dataset validate --file cpt.jsonl --schema cpt",
4849
"bl dataset validate --file eval.jsonl --full-validate",
4950
"bl dataset validate --file train.jsonl --output json",
5051
],
5152
notes: [
5253
"Default scan: every line gets a structural check, then ~160 lines (front 50,",
5354
"evenly spaced 100, last 10) are JSON.parsed against the active schema.",
5455
"Schemas: chatml = {messages:[...]} (SFT); dpo = {messages:[...], chosen,",
55-
"rejected} where chosen/rejected are single assistant messages. With no",
56-
"--schema, a record carrying chosen/rejected is validated as DPO; pass",
57-
"--schema dpo to require chosen/rejected on every record (strict), or",
58-
"--schema chatml to ignore preference fields.",
59-
"Use --full-validate to JSON.parse every line.",
56+
"rejected} where chosen/rejected are single assistant messages; cpt =",
57+
'{text:"..."} (continual pre-training, raw text). With no --schema, a',
58+
"record carrying chosen/rejected is validated as DPO, one with text (and no",
59+
"messages) as CPT, otherwise as ChatML. Pass --schema dpo / cpt to require",
60+
"that shape on every record (strict), or --schema chatml to ignore the",
61+
"preference / text fields. Use --full-validate to JSON.parse every line.",
6062
],
6163
async run(config: Config, flags: GlobalFlags) {
6264
const filePath = flags.file as string | undefined;

packages/cli/src/commands/finetune/create.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,8 @@ export default defineCommand({
275275
"--datasets / --validations accept either file-ids (from `bl dataset",
276276
"upload`) or local .jsonl paths. Local paths are validated and uploaded",
277277
"first, then their file-ids are submitted — a one-step upload-and-train.",
278+
"Dataset record schema is chosen from --training-type: dpo* → {messages,",
279+
"chosen, rejected}; cpt → {text} (raw pre-training text); else {messages}.",
278280
"Pre-submit gate: if the training dataset's sample count is not greater",
279281
"than batch_size, the job is rejected before upload or quota consumption",
280282
"(the platform would otherwise fail ~10 min in, after data processing).",
@@ -298,8 +300,13 @@ export default defineCommand({
298300
`Supported values: ${TRAINING_TYPES_CLI.join(", ")} (default: ${DEFAULT_TRAINING_TYPE}).`,
299301
);
300302
}
301-
// dpo / dpo-lora → "dpo" schema (strict chosen/rejected); else ChatML.
302-
const datasetSchema: DatasetSchema = trainingType.startsWith("dpo") ? "dpo" : "chatml";
303+
// dpo / dpo-lora → "dpo" schema (strict chosen/rejected); cpt → "cpt"
304+
// (raw {text} records); else ChatML ({messages}).
305+
const datasetSchema: DatasetSchema = trainingType.startsWith("dpo")
306+
? "dpo"
307+
: trainingType === "cpt"
308+
? "cpt"
309+
: "chatml";
303310

304311
const training = await analyzeDatasetTokens(config, datasetsRaw!, "datasets", datasetSchema);
305312
const trainingFileIds = training.fileIds;
packages/cli/tests/e2e/.dataset-cpt-valid.jsonl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"text":"大型语言模型(LLM)是深度学习领域中近年来最受关注的方向之一。"}
2+
{"text":"持续预训练(CPT)旨在已有模型的基础上,注入领域语料以提升下游能力。"}

packages/cli/tests/e2e/dataset.e2e.test.ts

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,44 @@ describe.skipIf(!isDashScopeE2EReady())("e2e: dataset (offline)", () => {
103103
expect(data.stats.totalRecords).toBe(2);
104104
});
105105

106+
test("dataset validate 自动识别 CPT 并校验 {text} 记录", async () => {
107+
// No --schema: a record carrying `text` (and no `messages`) is auto-detected
108+
// as CPT and the valid fixture passes.
109+
const file = join(__dirname, ".dataset-cpt-valid.jsonl");
110+
const { stdout, stderr, exitCode } = await runCli([
111+
"dataset",
112+
"validate",
113+
"--file",
114+
file,
115+
"--output",
116+
"json",
117+
]);
118+
expect(exitCode, stderr).toBe(0);
119+
const data = parseStdoutJson<{ valid: boolean; stats: { totalRecords?: number } }>(stdout);
120+
expect(data.valid).toBe(true);
121+
expect(data.stats.totalRecords).toBe(2);
122+
});
123+
124+
test("dataset validate --schema cpt 拒绝缺失 text 的记录", async () => {
125+
const file = join(__dirname, ".dataset-valid.jsonl"); // SFT {messages}, no text
126+
const { stdout, exitCode } = await runCli([
127+
"dataset",
128+
"validate",
129+
"--file",
130+
file,
131+
"--schema",
132+
"cpt",
133+
"--output",
134+
"json",
135+
]);
136+
expect(exitCode).not.toBe(0);
137+
const data = parseStdoutJson<{ valid: boolean; errors: { code: string; path?: string }[] }>(
138+
stdout,
139+
);
140+
expect(data.valid).toBe(false);
141+
expect(data.errors.map((e) => e.code)).toContain("MISSING_TEXT");
142+
});
143+
106144
test("dataset validate --schema dpo 拒绝缺失 rejected 的记录", async () => {
107145
const file = join(__dirname, ".dataset-dpo-invalid.jsonl");
108146
const { stdout, exitCode } = await runCli([

packages/core/src/dataset/validate/common.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,11 @@ export function emptyStats(): ValidationStats {
7575
export function parseDatasetSchemaFlag(value: string | undefined): DatasetSchema | undefined {
7676
if (value === undefined || value.trim() === "") return undefined;
7777
const v = value.trim();
78-
if (v === "chatml" || v === "dpo") return v;
78+
if (v === "chatml" || v === "dpo" || v === "cpt") return v;
7979
throw new BailianError(
80-
`Unsupported --schema "${value}". Supported: chatml, dpo.`,
80+
`Unsupported --schema "${value}". Supported: chatml, dpo, cpt.`,
8181
ExitCode.USAGE,
82-
`Omit --schema to auto-detect per record (a record with chosen/rejected is treated as DPO).`,
82+
`Omit --schema to auto-detect per record (chosen/rejected → DPO, text → CPT, else ChatML).`,
8383
);
8484
}
8585

packages/core/src/dataset/validate/schemas/cpt.ts

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/**
2+
* CPT record schema — `{"text": "..."}` (continual pre-training).
3+
*
4+
* Unlike ChatML/DPO, CPT feeds raw continuation text rather than a
5+
* `messages[]` conversation. The platform's CPT format is one JSON object per
6+
 * line carrying a single `text` field. This spec requires a non-blank string `text`
7+
* so a CPT job (`--training-type cpt`) fails fast at validate time instead of
8+
* being forced through the ChatML inspector and rejected for a missing
9+
* `messages` field it was never meant to carry.
10+
*
11+
* Auto-detect deliberately matches only when `text` is present AND `messages`
12+
* is absent — so an SFT record that happens to carry a `text` field still
13+
* routes to ChatML, and a mixed record (both `text` and `messages`) is left
14+
* for the ChatML catch-all rather than silently swallowed as CPT.
15+
*/
16+
import { makeIssue } from "../common.ts";
17+
import type { ValidationIssue } from "../types.ts";
18+
import type { RecordSchemaSpec } from "./types.ts";
19+
20+
function inspectCPTRecord(record: Record<string, unknown>, lineNo: number): ValidationIssue[] {
21+
const out: ValidationIssue[] = [];
22+
if (!("text" in record)) {
23+
out.push(
24+
makeIssue("error", "MISSING_TEXT", `Required field "text" is missing.`, {
25+
line: lineNo,
26+
path: "text",
27+
}),
28+
);
29+
return out;
30+
}
31+
const text = record.text;
32+
if (typeof text !== "string") {
33+
out.push(
34+
makeIssue("error", "INVALID_TEXT", `"text" must be a string (got ${typeof text}).`, {
35+
line: lineNo,
36+
path: "text",
37+
}),
38+
);
39+
return out;
40+
}
41+
if (text.trim().length === 0) {
42+
out.push(
43+
makeIssue("error", "EMPTY_TEXT", `"text" must not be empty / whitespace-only.`, {
44+
line: lineNo,
45+
path: "text",
46+
}),
47+
);
48+
}
49+
return out;
50+
}
51+
52+
/**
53+
* CPT schema. Auto-detect: a record is treated as CPT if it carries a `text`
54+
* field and no `messages` field. Placed after DPO (which keys off
55+
* chosen/rejected) and before ChatML (the catch-all), so the three schemas
56+
* partition cleanly by their distinguishing field.
57+
*/
58+
export const cptSchema: RecordSchemaSpec = {
59+
name: "cpt",
60+
detect: (record) => "text" in record && !("messages" in record),
61+
inspect: inspectCPTRecord,
62+
};

packages/core/src/dataset/validate/schemas/index.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,13 @@
1414
import type { DatasetSchema } from "../types.ts";
1515
import type { RecordSchemaSpec } from "./types.ts";
1616
import { chatmlSchema } from "./chatml.ts";
17+
import { cptSchema } from "./cpt.ts";
1718
import { dpoSchema } from "./dpo.ts";
1819

19-
// Order matters: DPO before ChatML (ChatML is the catch-all fallback).
20-
export const RECORD_SCHEMAS: RecordSchemaSpec[] = [dpoSchema, chatmlSchema];
20+
// Order matters: DPO (chosen/rejected) and CPT (text) before ChatML (the
21+
// catch-all fallback). Each keys off a distinguishing field so the three
22+
// rarely overlap; when they do (e.g. {text, chosen}), list order decides.
23+
export const RECORD_SCHEMAS: RecordSchemaSpec[] = [dpoSchema, cptSchema, chatmlSchema];
2124

2225
/**
2326
* Pick the right schema for a single parsed record.

packages/core/src/dataset/validate/types.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,20 @@ export interface ValidateOpts {
2222
* - `"chatml"` — `{messages: [...]}` (SFT). `chosen`/`rejected` ignored.
2323
* - `"dpo"` — `{messages: [...], chosen: {role,content}, rejected: {...}}`.
2424
* Every record MUST carry `chosen` + `rejected`.
25+
* - `"cpt"` — `{text: "..."}` (continual pre-training). Raw text only,
26+
* no `messages[]`.
2527
* - `undefined` — auto-detect per record: a record with `chosen` or
26-
* `rejected` is validated as DPO, otherwise as ChatML.
27-
* `finetune create` sets this from `--training-type` (dpo* → "dpo") so a DPO
28-
* job with malformed preference pairs fails at validate time, not on the
28+
* `rejected` is validated as DPO, one with `text` (and no
29+
* `messages`) as CPT, otherwise as ChatML.
30+
* `finetune create` sets this from `--training-type` (dpo* → "dpo",
31+
* cpt → "cpt") so a malformed dataset fails at validate time, not on the
2932
* platform ten minutes in.
3033
*/
3134
schema?: DatasetSchema;
3235
}
3336

3437
/** The schemas a `.jsonl` record can be validated against. */
35-
export type DatasetSchema = "chatml" | "dpo";
38+
export type DatasetSchema = "chatml" | "dpo" | "cpt";
3639

3740
export type ValidationSeverity = "error" | "warning";
3841

packages/core/tests/dataset-validate.test.ts

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,16 +109,70 @@ describe("validateDataset — DPO schema", () => {
109109
});
110110
});
111111

112+
describe("validateDataset — CPT schema", () => {
113+
const CPT_OK = '{"text":"The quick brown fox jumps over the lazy dog."}';
114+
115+
test("valid CPT record passes under auto-detect and --schema cpt", async () => {
116+
const p = file("cpt_ok.jsonl", [CPT_OK]);
117+
const auto = await validateDataset(p, { fullValidate: true });
118+
expect(auto.valid).toBe(true);
119+
const cpt = await validateDataset(p, { fullValidate: true, schema: "cpt" });
120+
expect(cpt.valid).toBe(true);
121+
});
122+
123+
test("missing text → MISSING_TEXT under --schema cpt", async () => {
124+
const p = file("cpt_no_text.jsonl", ['{"title":"doc"}']);
125+
const r = await validateDataset(p, { fullValidate: true, schema: "cpt" });
126+
expect(r.valid).toBe(false);
127+
expect(codes(r).errors).toContain("MISSING_TEXT");
128+
});
129+
130+
test("non-string text → INVALID_TEXT", async () => {
131+
const p = file("cpt_bad_text.jsonl", ['{"text":42}']);
132+
const r = await validateDataset(p, { fullValidate: true, schema: "cpt" });
133+
expect(r.valid).toBe(false);
134+
expect(codes(r).errors).toContain("INVALID_TEXT");
135+
});
136+
137+
test("empty / whitespace-only text → EMPTY_TEXT", async () => {
138+
const p = file("cpt_empty.jsonl", ['{"text":" "}']);
139+
const r = await validateDataset(p, { fullValidate: true, schema: "cpt" });
140+
expect(r.valid).toBe(false);
141+
expect(codes(r).errors).toContain("EMPTY_TEXT");
142+
});
143+
144+
test("auto-detect routes a {text} record to CPT, not ChatML", async () => {
145+
// A CPT record has no `messages`; under auto-detect it must NOT produce a
146+
// ChatML MISSING_MESSAGES error — it should be validated as CPT and pass.
147+
const p = file("cpt_auto.jsonl", [CPT_OK]);
148+
const r = await validateDataset(p, { fullValidate: true });
149+
expect(r.valid).toBe(true);
150+
expect(codes(r).errors).not.toContain("MISSING_MESSAGES");
151+
});
152+
153+
test("SFT record with a stray text field still routes to ChatML", async () => {
154+
// {messages, text} is ambiguous; CPT detect requires text AND no messages,
155+
// so this falls through to ChatML and validates as SFT (text ignored).
156+
const p = file("mixed.jsonl", [
157+
'{"messages":[{"role":"user","content":"hi"},{"role":"assistant","content":"yo"}],"text":"noise"}',
158+
]);
159+
const r = await validateDataset(p, { fullValidate: true });
160+
expect(r.valid).toBe(true);
161+
expect(codes(r).errors).toEqual([]);
162+
});
163+
});
164+
112165
describe("parseDatasetSchemaFlag", () => {
113166
test("undefined / empty → undefined (auto)", () => {
114167
expect(parseDatasetSchemaFlag(undefined)).toBeUndefined();
115168
expect(parseDatasetSchemaFlag("")).toBeUndefined();
116169
expect(parseDatasetSchemaFlag(" ")).toBeUndefined();
117170
});
118171

119-
test("chatml / dpo pass through", () => {
172+
test("chatml / dpo / cpt pass through", () => {
120173
expect(parseDatasetSchemaFlag("chatml")).toBe("chatml");
121174
expect(parseDatasetSchemaFlag("dpo")).toBe("dpo");
175+
expect(parseDatasetSchemaFlag("cpt")).toBe("cpt");
122176
expect(parseDatasetSchemaFlag(" dpo ")).toBe("dpo");
123177
});
124178

0 commit comments

Comments
 (0)