33 detectOutputFormat ,
44 uploadDataset ,
55 validateDataset ,
6+ parseDatasetSchemaFlag ,
67 MAX_DATASET_BYTES ,
78 BailianError ,
89 ExitCode ,
@@ -28,7 +29,8 @@ function formatIssue(issue: ValidationResult["errors"][number]): string {
2829export default defineCommand ( {
2930 name : "dataset upload" ,
3031 description : "Upload a dataset file (.jsonl) to Bailian" ,
31- usage : "bl dataset upload --file <path> [--purpose <name>] [--no-validate] [--full-validate]" ,
32+ usage :
33+ "bl dataset upload --file <path> [--purpose <name>] [--schema <chatml|dpo>] [--no-validate] [--full-validate]" ,
3234 options : [
3335 {
3436 flag : "--file <path>" ,
@@ -39,6 +41,11 @@ export default defineCommand({
3941 flag : "--purpose <name>" ,
4042 description : 'Dataset purpose tag (default: "fine-tune"; e.g. "evaluation")' ,
4143 } ,
44+ {
45+ flag : "--schema <s>" ,
46+ description :
47+ 'Record schema: "chatml" (SFT) or "dpo" (requires chosen/rejected). Default auto-detects per record.' ,
48+ } ,
4249 {
4350 flag : "--no-validate" ,
4451 description : "Skip the local JSONL pre-flight check (not recommended)" ,
@@ -52,15 +59,19 @@ export default defineCommand({
5259 ] ,
5360 examples : [
5461 "bl dataset upload --file train.jsonl" ,
62+ "bl dataset upload --file dpo.jsonl --schema dpo" ,
5563 "bl dataset upload --file eval.jsonl --purpose evaluation" ,
5664 "bl dataset upload --file train.jsonl --full-validate" ,
5765 "bl dataset upload --file train.jsonl --no-validate" ,
5866 ] ,
5967 notes : [
60- "Only .jsonl is supported in this release. The default validator expects a" ,
61- 'ChatML schema (each line a JSON object with a "messages" array). Other' ,
62- "purposes may carry a different schema in the future and would be served" ,
63- "by a purpose-specific validator at that point." ,
68+ "Only .jsonl is supported in this release. Two record schemas are" ,
69+ "recognized: chatml = {messages:[...]} (SFT); dpo = {messages:[...]," ,
70+ "chosen, rejected} where chosen/rejected are single assistant messages." ,
71+ "With no --schema, a record carrying chosen/rejected is validated as DPO;" ,
72+ "pass --schema dpo to require it on every record, or --schema chatml to" ,
73+ "ignore preference fields. Other purposes may carry a different schema in" ,
74+ "the future and would be served by a purpose-specific validator." ,
6475 "The dataset upload cap is 300MB per file." ,
6576 "Upload uses the OpenAI-compatible /compatible-mode/v1/files endpoint so" ,
6677 "the purpose tag is persisted (the DashScope-native /api/v1/files drops it)." ,
@@ -72,10 +83,11 @@ export default defineCommand({
7283 const purpose = ( flags . purpose as string | undefined ) || "fine-tune" ;
7384 const skipValidate = Boolean ( flags . noValidate ) ;
7485 const fullValidate = Boolean ( flags . fullValidate ) ;
86+ const schema = parseDatasetSchemaFlag ( flags . schema as string | undefined ) ;
7587 const format = detectOutputFormat ( config . output ) ;
7688
7789 if ( ! skipValidate ) {
78- const result = await validateDataset ( filePath ! , { fullValidate } ) ;
90+ const result = await validateDataset ( filePath ! , { fullValidate, schema } ) ;
7991 if ( ! result . valid ) {
8092 const lines = [
8193 `Dataset validation failed for ${ filePath } ` ,
@@ -112,6 +124,7 @@ export default defineCommand({
112124 purpose,
113125 max_bytes : MAX_DATASET_BYTES ,
114126 validate : ! skipValidate ,
127+ schema : schema ?? "auto" ,
115128 } ,
116129 format ,
117130 ) ;
0 commit comments