@@ -109,16 +109,70 @@ describe("validateDataset — DPO schema", () => {
109109 } ) ;
110110} ) ;
111111
describe("validateDataset — CPT schema", () => {
  // A record these tests expect to pass CPT validation: one non-blank string `text`.
  const CPT_OK = '{"text":"The quick brown fox jumps over the lazy dog."}';

  // Full validation with no explicit schema (auto-detect path).
  const validateAuto = (datasetPath) =>
    validateDataset(datasetPath, { fullValidate: true });

  // Full validation with the schema explicitly pinned to "cpt".
  const validateAsCpt = (datasetPath) =>
    validateDataset(datasetPath, { fullValidate: true, schema: "cpt" });

  // Builds a one-line fixture via the `file` helper, validates it as CPT, and
  // asserts the run is invalid and reports `errorCode` among its error codes.
  const expectCptRejection = async (fixtureName, jsonLine, errorCode) => {
    const datasetPath = file(fixtureName, [jsonLine]);
    const result = await validateAsCpt(datasetPath);
    expect(result.valid).toBe(false);
    expect(codes(result).errors).toContain(errorCode);
  };

  test("valid CPT record passes under auto-detect and --schema cpt", async () => {
    const datasetPath = file("cpt_ok.jsonl", [CPT_OK]);

    const autoResult = await validateAuto(datasetPath);
    expect(autoResult.valid).toBe(true);

    const cptResult = await validateAsCpt(datasetPath);
    expect(cptResult.valid).toBe(true);
  });

  test("missing text → MISSING_TEXT under --schema cpt", () =>
    expectCptRejection("cpt_no_text.jsonl", '{"title":"doc"}', "MISSING_TEXT"));

  test("non-string text → INVALID_TEXT", () =>
    expectCptRejection("cpt_bad_text.jsonl", '{"text":42}', "INVALID_TEXT"));

  test("empty / whitespace-only text → EMPTY_TEXT", () =>
    expectCptRejection("cpt_empty.jsonl", '{"text":" "}', "EMPTY_TEXT"));

  test("auto-detect routes a {text} record to CPT, not ChatML", async () => {
    // The record carries no `messages` key. With no schema given, the run must
    // pass and must not report the ChatML MISSING_MESSAGES error.
    const datasetPath = file("cpt_auto.jsonl", [CPT_OK]);
    const result = await validateAuto(datasetPath);
    expect(result.valid).toBe(true);
    expect(codes(result).errors).not.toContain("MISSING_MESSAGES");
  });

  test("SFT record with a stray text field still routes to ChatML", async () => {
    // A record holding both `messages` and `text` is ambiguous. The expectation
    // pinned here: CPT detection needs `text` without `messages`, so the record
    // is validated as ChatML/SFT and the extra `text` field raises no error.
    const mixedRecord =
      '{"messages":[{"role":"user","content":"hi"},{"role":"assistant","content":"yo"}],"text":"noise"}';
    const datasetPath = file("mixed.jsonl", [mixedRecord]);
    const result = await validateAuto(datasetPath);
    expect(result.valid).toBe(true);
    expect(codes(result).errors).toEqual([]);
  });
});
164+
112165describe ( "parseDatasetSchemaFlag" , ( ) => {
113166 test ( "undefined / empty → undefined (auto)" , ( ) => {
114167 expect ( parseDatasetSchemaFlag ( undefined ) ) . toBeUndefined ( ) ;
115168 expect ( parseDatasetSchemaFlag ( "" ) ) . toBeUndefined ( ) ;
116169 expect ( parseDatasetSchemaFlag ( " " ) ) . toBeUndefined ( ) ;
117170 } ) ;
118171
119- test ( "chatml / dpo pass through" , ( ) => {
172+ test ( "chatml / dpo / cpt pass through" , ( ) => {
120173 expect ( parseDatasetSchemaFlag ( "chatml" ) ) . toBe ( "chatml" ) ;
121174 expect ( parseDatasetSchemaFlag ( "dpo" ) ) . toBe ( "dpo" ) ;
175+ expect ( parseDatasetSchemaFlag ( "cpt" ) ) . toBe ( "cpt" ) ;
122176 expect ( parseDatasetSchemaFlag ( " dpo " ) ) . toBe ( "dpo" ) ;
123177 } ) ;
124178
0 commit comments