feat: add cpt dataset type

gujieye · gujieye · commit 18d5c420df44 · 2026-06-25T19:59:17.000+08:00
diff --git a/packages/cli/src/commands/dataset/upload.ts b/packages/cli/src/commands/dataset/upload.ts
@@ -19,7 +19,7 @@ export default defineCommand({
   name: "dataset upload",
   description: "Upload a dataset file (.jsonl) to Bailian",
   usage:
-    "bl dataset upload --file <path> [--purpose <name>] [--schema <chatml|dpo>] [--no-validate] [--full-validate]",
+    "bl dataset upload --file <path> [--purpose <name>] [--schema <chatml|dpo|cpt>] [--no-validate] [--full-validate]",
   options: [
     {
       flag: "--file <path>",
@@ -33,7 +33,7 @@ export default defineCommand({
     {
       flag: "--schema <s>",
       description:
-        'Record schema: "chatml" (SFT) or "dpo" (requires chosen/rejected). Default auto-detects per record.',
+        'Record schema: "chatml" (SFT), "dpo" (chosen/rejected), or "cpt" (raw text). Default auto-detects per record.',
     },
     {
       flag: "--no-validate",
@@ -49,17 +49,20 @@ export default defineCommand({
   examples: [
     "bl dataset upload --file train.jsonl",
     "bl dataset upload --file dpo.jsonl --schema dpo",
+    "bl dataset upload --file cpt.jsonl --schema cpt",
     "bl dataset upload --file eval.jsonl --purpose evaluation",
     "bl dataset upload --file train.jsonl --full-validate",
     "bl dataset upload --file train.jsonl --no-validate",
   ],
   notes: [
-    "Only .jsonl is supported in this release. Two record schemas are",
+    "Only .jsonl is supported in this release. Three record schemas are",
     "recognized: chatml = {messages:[...]} (SFT); dpo = {messages:[...],",
-    "chosen, rejected} where chosen/rejected are single assistant messages.",
-    "With no --schema, a record carrying chosen/rejected is validated as DPO;",
-    "pass --schema dpo to require it on every record, or --schema chatml to",
-    "ignore preference fields. Other purposes may carry a different schema in",
+    "chosen, rejected} where chosen/rejected are single assistant messages;",
+    'cpt = {text:"..."} (continual pre-training, raw text). With no --schema,',
+    "a record carrying chosen/rejected is validated as DPO, one with text (and",
+    "no messages) as CPT, otherwise as ChatML. Pass --schema dpo / cpt to",
+    "require that shape on every record, or --schema chatml to ignore the",
+    "preference / text fields. Other purposes may carry a different schema in",
     "the future and would be served by a purpose-specific validator.",
     "The dataset upload cap is 300MB per file.",
     "Upload uses the OpenAI-compatible /compatible-mode/v1/files endpoint so",
diff --git a/packages/cli/src/commands/dataset/validate.ts b/packages/cli/src/commands/dataset/validate.ts
@@ -28,7 +28,7 @@ export default defineCommand({
   description: "Locally validate a dataset file (.jsonl) without uploading",
   // 纯本地校验，不触网、不需 API key（与 `pipeline validate` 一致）。
   skipDefaultApiKeySetup: true,
-  usage: "bl dataset validate --file <path> [--full-validate] [--schema <chatml|dpo>]",
+  usage: "bl dataset validate --file <path> [--full-validate] [--schema <chatml|dpo|cpt>]",
   options: [
     { flag: "--file <path>", description: "Local .jsonl dataset file", required: true },
     {
@@ -39,24 +39,26 @@ export default defineCommand({
     {
       flag: "--schema <s>",
       description:
-        'Record schema: "chatml" (SFT) or "dpo" (requires chosen/rejected). Default auto-detects per record.',
+        'Record schema: "chatml" (SFT), "dpo" (chosen/rejected), or "cpt" (raw text). Default auto-detects per record.',
     },
   ],
   examples: [
     "bl dataset validate --file train.jsonl",
     "bl dataset validate --file dpo.jsonl --schema dpo",
+    "bl dataset validate --file cpt.jsonl --schema cpt",
     "bl dataset validate --file eval.jsonl --full-validate",
     "bl dataset validate --file train.jsonl --output json",
   ],
   notes: [
     "Default scan: every line gets a structural check, then ~160 lines (front 50,",
     "evenly spaced 100, last 10) are JSON.parsed against the active schema.",
     "Schemas: chatml = {messages:[...]} (SFT); dpo = {messages:[...], chosen,",
-    "rejected} where chosen/rejected are single assistant messages. With no",
-    "--schema, a record carrying chosen/rejected is validated as DPO; pass",
-    "--schema dpo to require chosen/rejected on every record (strict), or",
-    "--schema chatml to ignore preference fields.",
-    "Use --full-validate to JSON.parse every line.",
+    "rejected} where chosen/rejected are single assistant messages; cpt =",
+    '{text:"..."} (continual pre-training, raw text). With no --schema, a',
+    "record carrying chosen/rejected is validated as DPO, one with text (and no",
+    "messages) as CPT, otherwise as ChatML. Pass --schema dpo / cpt to require",
+    "that shape on every record (strict), or --schema chatml to ignore the",
+    "preference / text fields. Use --full-validate to JSON.parse every line.",
   ],
   async run(config: Config, flags: GlobalFlags) {
     const filePath = flags.file as string | undefined;
diff --git a/packages/cli/src/commands/finetune/create.ts b/packages/cli/src/commands/finetune/create.ts
@@ -275,6 +275,8 @@ export default defineCommand({
     "--datasets / --validations accept either file-ids (from `bl dataset",
     "upload`) or local .jsonl paths. Local paths are validated and uploaded",
     "first, then their file-ids are submitted — a one-step upload-and-train.",
+    "Dataset record schema is chosen from --training-type: dpo* → {messages,",
+    "chosen, rejected}; cpt → {text} (raw pre-training text); else {messages}.",
     "Pre-submit gate: if the training dataset's sample count is not greater",
     "than batch_size, the job is rejected before upload or quota consumption",
     "(the platform would otherwise fail ~10 min in, after data processing).",
@@ -298,8 +300,13 @@ export default defineCommand({
         `Supported values: ${TRAINING_TYPES_CLI.join(", ")} (default: ${DEFAULT_TRAINING_TYPE}).`,
       );
     }
-    // dpo / dpo-lora → "dpo" schema (strict chosen/rejected); else ChatML.
-    const datasetSchema: DatasetSchema = trainingType.startsWith("dpo") ? "dpo" : "chatml";
+    // dpo / dpo-lora → "dpo" schema (strict chosen/rejected); cpt → "cpt"
+    // (raw {text} records); else ChatML ({messages}).
+    const datasetSchema: DatasetSchema = trainingType.startsWith("dpo")
+      ? "dpo"
+      : trainingType === "cpt"
+        ? "cpt"
+        : "chatml";
 
     const training = await analyzeDatasetTokens(config, datasetsRaw!, "datasets", datasetSchema);
     const trainingFileIds = training.fileIds;
diff --git a/packages/cli/tests/e2e/.dataset-cpt-valid.jsonl b/packages/cli/tests/e2e/.dataset-cpt-valid.jsonl
@@ -0,0 +1,2 @@
+{"text":"大型语言模型（LLM）是深度学习领域中近年来最受关注的方向之一。"}
+{"text":"持续预训练（CPT）旨在已有模型的基础上，注入领域语料以提升下游能力。"}
diff --git a/packages/cli/tests/e2e/dataset.e2e.test.ts b/packages/cli/tests/e2e/dataset.e2e.test.ts
@@ -103,6 +103,44 @@ describe.skipIf(!isDashScopeE2EReady())("e2e: dataset (offline)", () => {
     expect(data.stats.totalRecords).toBe(2);
   });
 
+  test("dataset validate 自动识别 CPT 并校验 {text} 记录", async () => {
+    // No --schema: a record carrying `text` (and no `messages`) is auto-detected
+    // as CPT and the valid fixture passes.
+    const file = join(__dirname, ".dataset-cpt-valid.jsonl");
+    const { stdout, stderr, exitCode } = await runCli([
+      "dataset",
+      "validate",
+      "--file",
+      file,
+      "--output",
+      "json",
+    ]);
+    expect(exitCode, stderr).toBe(0);
+    const data = parseStdoutJson<{ valid: boolean; stats: { totalRecords?: number } }>(stdout);
+    expect(data.valid).toBe(true);
+    expect(data.stats.totalRecords).toBe(2);
+  });
+
+  test("dataset validate --schema cpt 拒绝缺失 text 的记录", async () => {
+    const file = join(__dirname, ".dataset-valid.jsonl"); // SFT {messages}, no text
+    const { stdout, exitCode } = await runCli([
+      "dataset",
+      "validate",
+      "--file",
+      file,
+      "--schema",
+      "cpt",
+      "--output",
+      "json",
+    ]);
+    expect(exitCode).not.toBe(0);
+    const data = parseStdoutJson<{ valid: boolean; errors: { code: string; path?: string }[] }>(
+      stdout,
+    );
+    expect(data.valid).toBe(false);
+    expect(data.errors.map((e) => e.code)).toContain("MISSING_TEXT");
+  });
+
   test("dataset validate --schema dpo 拒绝缺失 rejected 的记录", async () => {
     const file = join(__dirname, ".dataset-dpo-invalid.jsonl");
     const { stdout, exitCode } = await runCli([
diff --git a/packages/core/src/dataset/validate/common.ts b/packages/core/src/dataset/validate/common.ts
@@ -75,11 +75,11 @@ export function emptyStats(): ValidationStats {
 export function parseDatasetSchemaFlag(value: string | undefined): DatasetSchema | undefined {
   if (value === undefined || value.trim() === "") return undefined;
   const v = value.trim();
-  if (v === "chatml" || v === "dpo") return v;
+  if (v === "chatml" || v === "dpo" || v === "cpt") return v;
   throw new BailianError(
-    `Unsupported --schema "${value}". Supported: chatml, dpo.`,
+    `Unsupported --schema "${value}". Supported: chatml, dpo, cpt.`,
     ExitCode.USAGE,
-    `Omit --schema to auto-detect per record (a record with chosen/rejected is treated as DPO).`,
+    `Omit --schema to auto-detect per record (chosen/rejected → DPO, text → CPT, else ChatML).`,
   );
 }
 
diff --git a/packages/core/src/dataset/validate/schemas/cpt.ts b/packages/core/src/dataset/validate/schemas/cpt.ts
@@ -0,0 +1,62 @@
+/**
+ * CPT record schema — `{"text": "..."}` (continual pre-training).
+ *
+ * Unlike ChatML/DPO, CPT feeds raw continuation text rather than a
+ * `messages[]` conversation. The platform's CPT format is one JSON object per
+ * line carrying a single `text` field. This spec checks that `text` is a
+ * non-blank string (extra fields are not rejected), so a CPT dataset
+ * (`--training-type cpt`) is validated against its own shape instead of being
+ * rejected by the ChatML inspector for a `messages` field it never carries.
+ *
+ * Auto-detect deliberately matches only when `text` is present AND `messages`
+ * is absent, so a mixed record carrying both fields (e.g. an SFT record with
+ * a stray `text`) is never silently swallowed as CPT; it is left for the
+ * ChatML catch-all instead.
+ */
+import { makeIssue } from "../common.ts";
+import type { ValidationIssue } from "../types.ts";
+import type { RecordSchemaSpec } from "./types.ts";
+
+function inspectCPTRecord(record: Record<string, unknown>, lineNo: number): ValidationIssue[] {
+  const out: ValidationIssue[] = [];
+  if (!("text" in record)) {
+    out.push(
+      makeIssue("error", "MISSING_TEXT", `Required field "text" is missing.`, {
+        line: lineNo,
+        path: "text",
+      }),
+    );
+    return out;
+  }
+  const text = record.text;
+  if (typeof text !== "string") {
+    out.push(
+      makeIssue("error", "INVALID_TEXT", `"text" must be a string (got ${typeof text}).`, {
+        line: lineNo,
+        path: "text",
+      }),
+    );
+    return out;
+  }
+  if (text.trim().length === 0) {
+    out.push(
+      makeIssue("error", "EMPTY_TEXT", `"text" must not be empty / whitespace-only.`, {
+        line: lineNo,
+        path: "text",
+      }),
+    );
+  }
+  return out;
+}
+
+/**
+ * CPT schema. Auto-detect: a record is treated as CPT if it carries a `text`
+ * field and no `messages` field. Placed after DPO (which keys off
+ * chosen/rejected) and before ChatML (the catch-all), so a `text` record that
+ * also carries chosen/rejected goes to DPO and anything neither claims to ChatML.
+ */
+export const cptSchema: RecordSchemaSpec = {
+  name: "cpt",
+  detect: (record) => "text" in record && !("messages" in record),
+  inspect: inspectCPTRecord,
+};
diff --git a/packages/core/src/dataset/validate/schemas/index.ts b/packages/core/src/dataset/validate/schemas/index.ts
@@ -14,10 +14,13 @@
 import type { DatasetSchema } from "../types.ts";
 import type { RecordSchemaSpec } from "./types.ts";
 import { chatmlSchema } from "./chatml.ts";
+import { cptSchema } from "./cpt.ts";
 import { dpoSchema } from "./dpo.ts";
 
-// Order matters: DPO before ChatML (ChatML is the catch-all fallback).
-export const RECORD_SCHEMAS: RecordSchemaSpec[] = [dpoSchema, chatmlSchema];
+// Order matters: DPO (chosen/rejected) is listed first, then CPT (text
+// without messages), and ChatML last as the catch-all fallback — so a record
+// carrying both chosen/rejected and text is treated as DPO, not CPT.
+export const RECORD_SCHEMAS: RecordSchemaSpec[] = [dpoSchema, cptSchema, chatmlSchema];
 
 /**
  * Pick the right schema for a single parsed record.
diff --git a/packages/core/src/dataset/validate/types.ts b/packages/core/src/dataset/validate/types.ts
@@ -22,17 +22,20 @@ export interface ValidateOpts {
    *   - `"chatml"` — `{messages: [...]}` (SFT). `chosen`/`rejected` ignored.
    *   - `"dpo"`    — `{messages: [...], chosen: {role,content}, rejected: {...}}`.
    *                   Every record MUST carry `chosen` + `rejected`.
+   *   - `"cpt"`    — `{text: "..."}` (continual pre-training). Every record
+   *                   MUST carry a non-blank string `text`; no `messages[]` needed.
    *   - `undefined` — auto-detect per record: a record with `chosen` or
-   *                   `rejected` is validated as DPO, otherwise as ChatML.
-   * `finetune create` sets this from `--training-type` (dpo* → "dpo") so a DPO
-   * job with malformed preference pairs fails at validate time, not on the
+   *                   `rejected` is validated as DPO, one with `text` (and no
+   *                   `messages`) as CPT, otherwise as ChatML.
+   * `finetune create` sets this from `--training-type` (dpo* → "dpo",
+   * cpt → "cpt") so a malformed dataset fails at validate time, not on the
    * platform ten minutes in.
    */
   schema?: DatasetSchema;
 }
 
 /** The schemas a `.jsonl` record can be validated against. */
-export type DatasetSchema = "chatml" | "dpo";
+export type DatasetSchema = "chatml" | "dpo" | "cpt";
 
 export type ValidationSeverity = "error" | "warning";
 
diff --git a/packages/core/tests/dataset-validate.test.ts b/packages/core/tests/dataset-validate.test.ts
@@ -109,16 +109,70 @@ describe("validateDataset — DPO schema", () => {
   });
 });
 
+describe("validateDataset — CPT schema", () => {
+  const CPT_OK = '{"text":"The quick brown fox jumps over the lazy dog."}';
+
+  test("valid CPT record passes under auto-detect and --schema cpt", async () => {
+    const p = file("cpt_ok.jsonl", [CPT_OK]);
+    const auto = await validateDataset(p, { fullValidate: true });
+    expect(auto.valid).toBe(true);
+    const cpt = await validateDataset(p, { fullValidate: true, schema: "cpt" });
+    expect(cpt.valid).toBe(true);
+  });
+
+  test("missing text → MISSING_TEXT under --schema cpt", async () => {
+    const p = file("cpt_no_text.jsonl", ['{"title":"doc"}']);
+    const r = await validateDataset(p, { fullValidate: true, schema: "cpt" });
+    expect(r.valid).toBe(false);
+    expect(codes(r).errors).toContain("MISSING_TEXT");
+  });
+
+  test("non-string text → INVALID_TEXT", async () => {
+    const p = file("cpt_bad_text.jsonl", ['{"text":42}']);
+    const r = await validateDataset(p, { fullValidate: true, schema: "cpt" });
+    expect(r.valid).toBe(false);
+    expect(codes(r).errors).toContain("INVALID_TEXT");
+  });
+
+  test("empty / whitespace-only text → EMPTY_TEXT", async () => {
+    const p = file("cpt_empty.jsonl", ['{"text":"   "}']);
+    const r = await validateDataset(p, { fullValidate: true, schema: "cpt" });
+    expect(r.valid).toBe(false);
+    expect(codes(r).errors).toContain("EMPTY_TEXT");
+  });
+
+  test("auto-detect routes a {text} record to CPT, not ChatML", async () => {
+    // A CPT record has no `messages`; under auto-detect it must NOT produce a
+    // ChatML MISSING_MESSAGES error — it should be validated as CPT and pass.
+    const p = file("cpt_auto.jsonl", [CPT_OK]);
+    const r = await validateDataset(p, { fullValidate: true });
+    expect(r.valid).toBe(true);
+    expect(codes(r).errors).not.toContain("MISSING_MESSAGES");
+  });
+
+  test("SFT record with a stray text field still routes to ChatML", async () => {
+    // {messages, text} is ambiguous; CPT detect requires text AND no messages,
+    // so this falls through to ChatML and validates as SFT (text ignored).
+    const p = file("mixed.jsonl", [
+      '{"messages":[{"role":"user","content":"hi"},{"role":"assistant","content":"yo"}],"text":"noise"}',
+    ]);
+    const r = await validateDataset(p, { fullValidate: true });
+    expect(r.valid).toBe(true);
+    expect(codes(r).errors).toEqual([]);
+  });
+});
+
 describe("parseDatasetSchemaFlag", () => {
   test("undefined / empty → undefined (auto)", () => {
     expect(parseDatasetSchemaFlag(undefined)).toBeUndefined();
     expect(parseDatasetSchemaFlag("")).toBeUndefined();
     expect(parseDatasetSchemaFlag("  ")).toBeUndefined();
   });
 
-  test("chatml / dpo pass through", () => {
+  test("chatml / dpo / cpt pass through", () => {
     expect(parseDatasetSchemaFlag("chatml")).toBe("chatml");
     expect(parseDatasetSchemaFlag("dpo")).toBe("dpo");
+    expect(parseDatasetSchemaFlag("cpt")).toBe("cpt");
     expect(parseDatasetSchemaFlag("  dpo ")).toBe("dpo");
   });
 
diff --git a/skills/bailian-cli/reference/dataset.md b/skills/bailian-cli/reference/dataset.md
diff --git a/skills/bailian-cli/reference/finetune.md b/skills/bailian-cli/reference/finetune.md

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+{"text":"大型语言模型（LLM）是深度学习领域中近年来最受关注的方向之一。"}`
	`2`	`+{"text":"持续预训练（CPT）旨在已有模型的基础上，注入领域语料以提升下游能力。"}`